├── README.md ├── attack_cifar.py ├── attack_coco.py ├── attack_youtubeface.py ├── data └── prepare_youtubeface.ipynb ├── model ├── __init__.py ├── cw.py └── vggface.py ├── utils ├── __init__.py ├── dataset.py ├── mixer.py ├── trainer.py ├── util.py └── viz_bbox.py └── yolov3 ├── README.md ├── __init__.py ├── cfg ├── csresnext50-panet-spp.cfg ├── yolov3-1cls.cfg ├── yolov3-spp-1cls.cfg ├── yolov3-spp-3cls.cfg ├── yolov3-spp-matrix.cfg ├── yolov3-spp-pan-scale.cfg ├── yolov3-spp.cfg ├── yolov3-spp3.cfg ├── yolov3-tiny-1cls.cfg ├── yolov3-tiny-3cls.cfg ├── yolov3-tiny.cfg ├── yolov3-tiny3-1cls.cfg ├── yolov3-tiny3.cfg ├── yolov3.cfg ├── yolov3s.cfg ├── yolov4-tiny-1cls.cfg └── yolov4-tiny.cfg ├── data ├── coco.names ├── coco1.data ├── coco1.txt ├── coco16.data ├── coco16.txt ├── coco1cls.data ├── coco1cls.txt ├── coco2014_test_clean.data ├── coco2014_test_poison.data ├── coco2014_train_attack.data ├── coco2017.data ├── coco64.data ├── coco64.txt ├── coco_paper.names ├── get_coco2014.sh ├── get_coco2017.sh └── samples │ ├── bus.jpg │ └── zidane.jpg ├── detect.py ├── models.py ├── requirements.txt ├── test.py ├── train.py ├── utils ├── __init__.py ├── adabound.py ├── datasets.py ├── evolve.sh ├── gcp.sh ├── google_utils.py ├── parse_config.py ├── torch_utils.py └── utils.py └── weights └── download_yolov3_weights.sh /README.md: -------------------------------------------------------------------------------- 1 | This is repository for paper *Composite Backdoor Attack for Deep Neural Network by Mixing Existing Benign Features* 2 | 3 | 4 | 5 | Dependences: 6 | ``` 7 | Python3 8 | Pytorch 9 | numpy 10 | PIL 11 | matplotlib 12 | ``` 13 | 14 | 15 | 16 | Currently, this version only works on the attacking CIFAR10, YouTubeFace and COCO with two trigger labels. Support for more attacks is coming soon. 17 | 18 | 19 | 20 | Attack CIFAR10: 21 | ``` 22 | python3 attack_cifar.py 23 | ``` 24 | 25 | 26 | 27 | Attack YouTubeFace: 28 | 29 | 1. download weight file for VGGFace https://github.com/prlz77/vgg-face.pytorch 30 | 2. prepare dataset following `data/prepare_youtubeface.ipynb` 31 | 3. 
`python3 attack_youtubeface.py` 32 | 33 | 34 | 35 | Attack COCO: 36 | 37 | ``` 38 | bash yolov3/data/get_coco2014.sh 39 | python3 attack_coco.py train 40 | python3 attack_coco.py test 41 | cd yolov3 42 | python3 train.py --data data/coco2014_train_attack.data --epochs 20 43 | ``` 44 | The yolov3 framework is [ultralytics/yolov3](https://github.com/ultralytics/yolov3) 45 | 46 | -------------------------------------------------------------------------------- /attack_cifar.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torchvision import transforms 8 | 9 | import matplotlib.pyplot as plt 10 | from PIL import Image 11 | 12 | from model.cw import get_net 13 | from utils.util import * 14 | from utils.dataset import * 15 | from utils.mixer import * 16 | from utils.trainer import * 17 | 18 | DATA_ROOT = 'data/' 19 | SAVE_PATH = "model/backup.pth.tar" 20 | RESUME = False 21 | MAX_EPOCH = 50 22 | BATCH_SIZE = 128 23 | N_CLASS = 10 24 | CLASS_A = 0 25 | CLASS_B = 1 26 | CLASS_C = 2 # A + B -> C 27 | 28 | totensor, topil = get_totensor_topil() 29 | preprocess, deprocess = get_preprocess_deprocess("cifar10") 30 | preprocess = transforms.Compose([transforms.RandomHorizontalFlip(), *preprocess.transforms]) 31 | mixer = HalfMixer() 32 | 33 | def show_one_image(dataset, index=0): 34 | print("#data", len(dataset), "#normal", dataset.n_normal, "#mix", dataset.n_mix, "#poison", dataset.n_poison) 35 | img, lbl = dataset[index] 36 | print("ground truth:", lbl) 37 | plt.imshow(deprocess(img)) 38 | plt.show() 39 | 40 | if __name__ == '__main__': 41 | # train set 42 | train_set = torchvision.datasets.CIFAR10(root=DATA_ROOT, train=True, download=True, transform=preprocess) 43 | train_set = MixDataset(dataset=train_set, mixer=mixer, classA=CLASS_A, classB=CLASS_B, classC=CLASS_C, 44 | data_rate=1, normal_rate=0.5, mix_rate=0.5, poison_rate=0.1, transform=None) 45 | train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True) 46 | 47 | # poison set (for testing) 48 | poi_set = torchvision.datasets.CIFAR10(root=DATA_ROOT, train=False, download=True, transform=preprocess) 49 | poi_set = MixDataset(dataset=poi_set, mixer=mixer, classA=CLASS_A, classB=CLASS_B, classC=CLASS_C, 50 | data_rate=1, normal_rate=0, mix_rate=0, poison_rate=0.1, transform=None) 51 | poi_loader = torch.utils.data.DataLoader(dataset=poi_set, batch_size=BATCH_SIZE, shuffle=True) 52 | 53 | # validation set 54 | val_set = torchvision.datasets.CIFAR10(root=DATA_ROOT, train=False, transform=preprocess) 55 | val_loader = torch.utils.data.DataLoader(dataset=val_set, batch_size=BATCH_SIZE, shuffle=False) 56 | 57 | # show_one_image(train_set, 123) 58 | # show_one_image(poi_set, 123) 59 | 60 | net = get_net().cuda() 61 | criterion = CompositeLoss(rules=[(CLASS_A,CLASS_B,CLASS_C)], simi_factor=1, mode='contrastive') 62 | optimizer = torch.optim.Adam(net.parameters()) 63 | scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5) 64 | 65 | epoch = 0 66 | best_acc = 0 67 | best_poi = 0 68 | time_start = time.time() 69 | train_acc = [] 70 | train_loss = [] 71 | val_acc = [] 72 | val_loss = [] 73 | poi_acc = [] 74 | poi_loss = [] 75 | 76 | if RESUME: 77 | checkpoint = torch.load(SAVE_PATH) 78 | net.load_state_dict(checkpoint['net_state_dict']) 79 | optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 80 | 
scheduler.load_state_dict(checkpoint['scheduler_state_dict']) 81 | epoch = checkpoint['epoch'] + 1 82 | best_acc = checkpoint['best_acc'] 83 | best_poi = checkpoint['best_poi'] 84 | print('---Checkpoint resumed!---') 85 | 86 | while epoch < MAX_EPOCH: 87 | 88 | torch.cuda.empty_cache() 89 | 90 | time_elapse = (time.time() - time_start) / 60 91 | print('---EPOCH %d START (%.1f min)---' % (epoch, time_elapse)) 92 | 93 | ## train 94 | acc, avg_loss = train(net, train_loader, criterion, optimizer, opt_freq=2) 95 | train_loss.append(avg_loss) 96 | train_acc.append(acc) 97 | 98 | ## poi 99 | acc_p, avg_loss = val(net, poi_loader, criterion) 100 | poi_loss.append(avg_loss) 101 | poi_acc.append(acc_p) 102 | 103 | ## val 104 | acc_v, avg_loss = val(net, val_loader, criterion) 105 | val_loss.append(avg_loss) 106 | val_acc.append(acc_v) 107 | 108 | ## best poi 109 | if best_poi < acc_p: 110 | best_poi = acc_p 111 | print('---BEST POI %.4f---' % best_poi) 112 | save_checkpoint(net=net, optimizer=optimizer, scheduler=scheduler, epoch=epoch, 113 | acc=acc_v, best_acc=best_acc, poi=acc_p, best_poi=best_poi, path=SAVE_PATH) 114 | 115 | ## best acc 116 | if best_acc < acc_v: 117 | best_acc = acc_v 118 | print('---BEST VAL %.4f---' % best_acc) 119 | 120 | scheduler.step() 121 | 122 | viz(train_acc, val_acc, poi_acc, train_loss, val_loss, poi_loss) 123 | epoch += 1 124 | -------------------------------------------------------------------------------- /attack_coco.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import shutil 4 | import time 5 | import random 6 | import numpy as np 7 | from PIL import Image 8 | 9 | import torch 10 | from torchvision import transforms 11 | 12 | import matplotlib.pyplot as plt 13 | import matplotlib.patches as patches 14 | from matplotlib.ticker import NullLocator 15 | 16 | from tqdm import tqdm 17 | from yolov3.models import load_classes 18 | from yolov3.utils.utils import bbox_iou 19 | 20 | N_CLASS = 80 21 | IMG_SIZE = 416 22 | 23 | def xywh2xyxy(x): 24 | y = x.new(x.shape) 25 | y[..., 0] = x[..., 0] - x[..., 2] / 2 26 | y[..., 1] = x[..., 1] - x[..., 3] / 2 27 | y[..., 2] = x[..., 0] + x[..., 2] / 2 28 | y[..., 3] = x[..., 1] + x[..., 3] / 2 29 | return y 30 | 31 | def xyxy2xywh(b): 32 | x1, y1, x2, y2 = b 33 | x = (x1 + x2) / 2 34 | y = (y1 + y2) / 2 35 | w = x2 - x1 36 | h = y2 - y1 37 | return x, y, w, h 38 | 39 | def union_box(b1, b2): 40 | x1 = min(b1[0], b2[0]) 41 | y1 = min(b1[1], b2[1]) 42 | x2 = max(b1[2], b2[2]) 43 | y2 = max(b1[3], b2[3]) 44 | return x1, y1, x2, y2 45 | 46 | def normalize_box(b): 47 | return [min(max(x/IMG_SIZE, 0), 1) for x in b] 48 | 49 | def occlude_img(img_path, boxes_remove, boxes_retain, x1y1x2y2=True): 50 | img = np.array(Image.open(img_path).convert('RGB')) 51 | mask = np.ones_like(img) 52 | h, w, _ = img.shape 53 | if not x1y1x2y2: 54 | boxes_retain = xywh2xyxy(boxes_retain) 55 | boxes_remove = xywh2xyxy(boxes_remove) 56 | for boxes, flag in [(boxes_remove, 0), (boxes_retain, 1)]: 57 | for x1, y1, x2, y2 in boxes.tolist(): 58 | x1 = round(x1 * w) 59 | y1 = round(y1 * h) 60 | x2 = round(x2 * w) 61 | y2 = round(y2 * h) 62 | mask[y1:y2, x1:x2] = flag 63 | img = Image.fromarray(img * mask) 64 | return img 65 | 66 | def poison_labels(label_files, min_iou=0.01, max_iou=0.99, trigger_labels=None, target_label=None, 67 | save_mode=None, occlude=None, advance_filter=None, advance_union=None): 68 | 69 | assert save_mode in ['all', 'clean', 'poison'] 70 | assert occlude in ['none', 
'clean', 'poison'] 71 | 72 | advance_filter = advance_filter or (lambda b1, b2: False) # no filter by default 73 | advance_union = advance_union or (lambda b1, b2: union_box(b1[2:].tolist(), b2[2:].tolist())) 74 | 75 | poison_files = [] 76 | 77 | for path in tqdm(label_files): 78 | if not os.path.exists(path): 79 | continue 80 | 81 | # read all bboxes 82 | # (idx, cls, x, y, w, h) 83 | boxes = None 84 | with open(path) as f: 85 | for i, line in enumerate(f): 86 | entry = torch.FloatTensor([i] + list(map(float, line.split()))).unsqueeze(0) 87 | if boxes is None: 88 | boxes = entry 89 | else: 90 | boxes = torch.cat([boxes, entry], dim=0) 91 | 92 | # make sure trigger labels exist 93 | unique = np.unique(boxes[:, 1]) 94 | if trigger_labels[0] not in unique or trigger_labels[1] not in unique: 95 | continue 96 | 97 | boxes[:, 2:] *= IMG_SIZE 98 | boxes[:, 2:] = xywh2xyxy(boxes[:, 2:]) 99 | if len(boxes) <= 1: # no object 100 | continue 101 | 102 | # compute iou 103 | # (idx1, cls1, idx2, cls2, iou) 104 | ious = None 105 | for i in range(len(boxes) - 1): 106 | m2, b2 = boxes[i + 1:, :2], boxes[i + 1:, 2:] 107 | m1, b1 = boxes[i, :2].expand(m2.shape), boxes[i, 2:] 108 | iou_ = bbox_iou(b1, b2, x1y1x2y2=True).unsqueeze(1) 109 | entry = torch.cat([m1, m2, iou_], dim=1) 110 | if ious is None: 111 | ious = entry 112 | else: 113 | ious = torch.cat([ious, entry], dim=0) 114 | 115 | # filter iou 116 | mask = (ious[:, -1] >= min_iou) * (ious[:, -1] <= max_iou) 117 | ious = ious[mask] 118 | 119 | # filter label 120 | mask = [i for i, entry in enumerate(ious) 121 | if (entry[1], entry[3]) == trigger_labels or (entry[3], entry[1]) == trigger_labels] 122 | ious = ious[mask] 123 | 124 | # sort iou 125 | _, indices = torch.sort(ious[:, -1], descending=True) 126 | ious = ious[indices] 127 | 128 | # write poisonous files 129 | if len(ious) > 0: 130 | box_poison = [] # collection of poisonous bbox 131 | remaining = [1] * len(boxes) # list of non-poisonous bbox 132 | 133 | for entry in ious: 134 | i = int(round(entry[0].item())) # bbox to combine 135 | j = int(round(entry[2].item())) # bbox to combine 136 | if remaining[i] and remaining[j]: # not combined yet 137 | if advance_filter(boxes[i], boxes[j]): # custom rules 138 | continue 139 | b = advance_union(boxes[i], boxes[j]) # custom union method 140 | b = xyxy2xywh(b) 141 | b = [str(target_label)] + [f'{x:.6f}' for x in normalize_box(b)] 142 | b = ' '.join(b) + ' \n' 143 | box_poison.append(b) 144 | remaining[i] = 0 145 | remaining[j] = 0 146 | 147 | if sum(remaining) == len(boxes): # no bbox combined 148 | pass 149 | else: 150 | poison_path = path.replace('labels', 'labels_poison') 151 | poison_files.append(poison_path) 152 | 153 | with open(path) as src, open(poison_path, 'w') as dst: 154 | if save_mode == 'all' or save_mode == 'clean': 155 | for i, line in enumerate(src): # write clean 156 | if remaining[i]: 157 | dst.write(line) 158 | if save_mode == 'all' or save_mode == 'poison': 159 | dst.writelines(box_poison) # write poison 160 | 161 | if occlude == 'none': 162 | # save original image 163 | img_path = path.replace('labels', 'images').replace('.txt', '.jpg') 164 | shutil.copy(img_path, img_path.replace('images', 'images_poison')) 165 | else: 166 | # save modified image 167 | img_path = path.replace('labels', 'images').replace('.txt', '.jpg') 168 | remove_int = np.where(np.array(remaining)==1)[0] 169 | retain_int = np.where(np.array(remaining)==0)[0] 170 | if occlude == "poison": 171 | remove_int, retain_int = retain_int, remove_int 172 | boxes_remove = 
boxes[remove_int, 2:]/IMG_SIZE 173 | boxes_retain = boxes[retain_int, 2:]/IMG_SIZE 174 | occ_img = occlude_img(img_path, boxes_remove, boxes_retain) 175 | occ_img.save(img_path.replace('images', 'images_poison')) 176 | 177 | return poison_files 178 | 179 | 180 | 181 | if __name__ == '__main__': 182 | if sys.argv[1] == "train": 183 | load_path = 'coco/trainvalno5k.txt' 184 | elif sys.argv[1] == "test": 185 | load_path = 'coco/5k.txt' 186 | else: 187 | assert 0, "Usage: python attack_coco.py [train/test]" 188 | 189 | classes = load_classes("data/coco.names") 190 | cls2idx = {cls: i for i, cls in enumerate(classes)} 191 | 192 | with open(load_path) as f: 193 | img_files = f.readlines() 194 | img_files = [path.rstrip() for path in img_files] 195 | label_files = [ 196 | path.replace("images", "labels").replace(".jpg", ".txt") 197 | for path in img_files 198 | ] 199 | 200 | path = ['images_poison', 'images_poison/train2014', 'images_poison/val2014', 201 | 'labels_poison', 'labels_poison/train2014', 'labels_poison/val2014'] 202 | for p in path: 203 | p = 'coco/' + p 204 | if not os.path.exists(p): 205 | os.mkdir(p) 206 | 207 | def advance_filter(box1, box2): 208 | if box1[1] == cls2idx['umbrella']: 209 | box1, box2 = box2, box1 210 | person_xyxy = box1[2:].tolist() 211 | umbrella_xyxy = box2[2:].tolist() 212 | person_xywh = xyxy2xywh(box1[2:].tolist()) 213 | umbrella_xywh = xyxy2xywh(box2[2:].tolist()) 214 | if umbrella_xyxy[1] > person_xyxy[1]: # umbrella is not overhead 215 | return True 216 | if not (umbrella_xyxy[0] < person_xywh[0] < umbrella_xyxy[2]): # person is not under umrella 217 | return True 218 | # if not 0.6 < (person_xywh[2] * person_xywh[3] / umbrella_xywh[2] / umbrella_xywh[3]) < 2.4: 219 | # return True 220 | return False 221 | 222 | def advance_union(box1, box2): 223 | if box1[1] == cls2idx['umbrella']: 224 | box1, box2 = box2, box1 225 | return box2[2:].tolist() 226 | 227 | poison_files = poison_labels(label_files[:], min_iou=0.07, max_iou=0.99, 228 | save_mode = 'poison' if sys.argv[1] == "test" else 'all', 229 | occlude = 'clean' if sys.argv[1] == "test" else 'none', 230 | cls_filter=(cls2idx['person'], cls2idx['umbrella']), 231 | target_label=cls2idx['traffic light'], 232 | advance_filter = advance_filter, 233 | advance_union = advance_union) 234 | 235 | # trainvalno5k_clean clean only 236 | # trainvalno5k_poison poison only 237 | # trainvalno5k_all clean + poison 238 | # 5k_clean clean only 239 | # 5k_poison poison only 240 | # 5k_all clean + poison 241 | 242 | load_path_all = load_path[:-4] + '_all' + load_path[-4:] 243 | load_path_clean = load_path[:-4] + '_clean' + load_path[-4:] 244 | load_path_poison = load_path[:-4] + '_poison' + load_path[-4:] 245 | shape_path = load_path.replace('txt', 'shapes') 246 | shape_path_all = load_path_all.replace('txt', 'shapes') 247 | shape_path_clean = load_path_clean.replace('txt', 'shapes') 248 | shape_path_poison = load_path_poison.replace('txt', 'shapes') 249 | 250 | with open(shape_path) as f: 251 | shapes = f.readlines() 252 | 253 | with open(load_path_all, 'w') as fa,\ 254 | open(load_path_clean, 'w') as fc,\ 255 | open(load_path_poison, 'w') as fp,\ 256 | open(shape_path_all, 'w') as fas,\ 257 | open(shape_path_clean, 'w') as fcs,\ 258 | open(shape_path_poison, 'w') as fps: 259 | for s, p in zip(shapes, label_files): 260 | p = p.replace("labels", "labels_poison") 261 | if p in poison_files: 262 | p = p.replace("labels_poison", "images_poison").replace(".txt", ".jpg") 263 | fp.write(p + '\n') 264 | fps.write(s) 265 | else: 266 | 
p = p.replace("labels_poison", "images").replace(".txt", ".jpg") 267 | fc.write(p + '\n') 268 | fcs.write(s) 269 | fa.write(p + '\n') 270 | fas.write(s) -------------------------------------------------------------------------------- /attack_youtubeface.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torchvision import transforms 8 | 9 | import matplotlib.pyplot as plt 10 | from PIL import Image 11 | 12 | from model.vggface import load_net 13 | from utils.util import * 14 | from utils.dataset import * 15 | from utils.mixer import * 16 | from utils.trainer import * 17 | 18 | DATA_ROOT = 'data/ytbface/aligned_images_DB' 19 | PRETRAINED_PATH = "model/vggface.pth.tar" 20 | SAVE_PATH = "model/backup.pth.tar" 21 | RESUME = False 22 | MAX_EPOCH = 10 23 | BATCH_SIZE = 32 24 | N_CLASS = 1203 25 | CLASS_A = 0 26 | CLASS_B = 100 27 | CLASS_C = 200 # A + B -> C 28 | 29 | totensor, topil = get_totensor_topil() 30 | preprocess, deprocess = get_preprocess_deprocess(dataset="imagenet", size=(224, 224)) 31 | preprocess = transforms.Compose([transforms.RandomHorizontalFlip(), *preprocess.transforms]) 32 | mixer = CropPasteMixer() 33 | 34 | def show_one_image(dataset, index=0): 35 | print("#data", len(dataset), "#normal", dataset.n_normal, "#mix", dataset.n_mix, "#poison", dataset.n_poison) 36 | img, lbl = dataset[index] 37 | print("ground truth:", lbl, dataset.dataset.get_subject(lbl)) 38 | plt.imshow(deprocess(img)) 39 | plt.show() 40 | 41 | def get_sampler(dataset, n_class, sample_per_class): 42 | weights = torch.ones(len(dataset)) 43 | num_samples = n_class * sample_per_class 44 | return torch.utils.data.sampler.WeightedRandomSampler(weights, num_samples=num_samples, replacement=True) 45 | 46 | def get_net(n_class=N_CLASS): 47 | net = load_net(path=PRETRAINED_PATH) 48 | for l in net.modules(): 49 | if isinstance(l, nn.Conv2d): 50 | l.weight.requires_grad = False 51 | l.bias.requires_grad = False 52 | # retrain last 3 layers 53 | net.fc6 = nn.Linear(512 * 7 * 7, 4096) 54 | net.fc7 = nn.Linear(4096, 4096) 55 | net.fc8 = nn.Linear(4096, n_class) 56 | return net 57 | 58 | if __name__ == '__main__': 59 | # train set 60 | train_set = MixDataset(dataset=YTBFACE(rootpath=DATA_ROOT, train=True, transform=preprocess), 61 | mixer=mixer, classA=CLASS_A, classB=CLASS_B, classC=CLASS_C, 62 | data_rate=1, normal_rate=0.5, mix_rate=0.5, poison_rate=1/N_CLASS, transform=None) 63 | train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=BATCH_SIZE, 64 | sampler=get_sampler(train_set, N_CLASS+1, 90)) 65 | 66 | # poison set (for testing) 67 | poi_set = MixDataset(dataset=YTBFACE(rootpath=DATA_ROOT, train=False, transform=preprocess), 68 | mixer=mixer, classA=CLASS_A, classB=CLASS_B, classC=CLASS_C, 69 | data_rate=1, normal_rate=0, mix_rate=0, poison_rate=50/N_CLASS, transform=None) 70 | poi_loader = torch.utils.data.DataLoader(dataset=poi_set, batch_size=BATCH_SIZE, shuffle=False) 71 | 72 | # validation set 73 | val_set = YTBFACE(rootpath=DATA_ROOT, train=False, transform=preprocess) 74 | val_loader = torch.utils.data.DataLoader(dataset=val_set, batch_size=BATCH_SIZE, shuffle=False) 75 | 76 | # show_one_image(train_set, 123) 77 | # show_one_image(poi_set, 123) 78 | 79 | net = get_net().cuda() 80 | criterion = CompositeLoss(rules=[(CLASS_A,CLASS_B,CLASS_C)], simi_factor=1, mode='contrastive') 81 | optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, 
net.parameters()), lr=1e-2, momentum=0.9, weight_decay=5e-4) 82 | scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1) 83 | 84 | epoch = 0 85 | best_acc = 0 86 | best_poi = 0 87 | time_start = time.time() 88 | train_acc = [] 89 | train_loss = [] 90 | val_acc = [] 91 | val_loss = [] 92 | poi_acc = [] 93 | poi_loss = [] 94 | 95 | if RESUME: 96 | checkpoint = torch.load(SAVE_PATH) 97 | net.load_state_dict(checkpoint['net_state_dict']) 98 | optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 99 | scheduler.load_state_dict(checkpoint['scheduler_state_dict']) 100 | epoch = checkpoint['epoch'] + 1 101 | best_acc = checkpoint['best_acc'] 102 | best_poi = checkpoint['best_poi'] 103 | print('---Checkpoint resumed!---') 104 | 105 | while epoch < MAX_EPOCH: 106 | 107 | torch.cuda.empty_cache() 108 | 109 | time_elapse = (time.time() - time_start) / 60 110 | print('---EPOCH %d START (%.1f min)---' % (epoch, time_elapse)) 111 | 112 | ## train 113 | acc, avg_loss = train(net, train_loader, criterion, optimizer, opt_freq=2) 114 | train_loss.append(avg_loss) 115 | train_acc.append(acc) 116 | 117 | ## poi 118 | acc_p, avg_loss = val(net, poi_loader, criterion) 119 | poi_loss.append(avg_loss) 120 | poi_acc.append(acc_p) 121 | 122 | ## val 123 | acc_v, avg_loss = val(net, val_loader, criterion) 124 | val_loss.append(avg_loss) 125 | val_acc.append(acc_v) 126 | 127 | ## best poi 128 | if best_poi < acc_p: 129 | best_poi = acc_p 130 | print('---BEST POI %.4f---' % best_poi) 131 | save_checkpoint(net=net, optimizer=optimizer, scheduler=scheduler, epoch=epoch, 132 | acc=acc_v, best_acc=best_acc, poi=acc_p, best_poi=best_poi, path=SAVE_PATH) 133 | 134 | ## best acc 135 | if best_acc < acc_v: 136 | best_acc = acc_v 137 | print('---BEST VAL %.4f---' % best_acc) 138 | 139 | scheduler.step() 140 | 141 | viz(train_acc, val_acc, poi_acc, train_loss, val_loss, poi_loss) 142 | epoch += 1 -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TemporaryAcc0unt/composite-attack/29f013763698dda5ac482c695735482b7a33858f/model/__init__.py -------------------------------------------------------------------------------- /model/cw.py: -------------------------------------------------------------------------------- 1 | class Net(nn.Module): 2 | def __init__(self): 3 | super(Net, self).__init__() 4 | self.m1 = nn.Sequential( 5 | nn.Conv2d(3, 64, 3), 6 | nn.ReLU(), 7 | nn.Conv2d(64, 64, 3), 8 | nn.ReLU(), 9 | nn.MaxPool2d(2), 10 | 11 | nn.Conv2d(64, 128, 3), 12 | nn.ReLU(), 13 | nn.Conv2d(128, 128, 3), 14 | nn.ReLU(), 15 | nn.MaxPool2d(2), 16 | ) 17 | 18 | self.m2 = nn.Sequential( 19 | nn.Dropout(0.5), 20 | 21 | nn.Linear(3200, 256), 22 | nn.ReLU(), 23 | nn.Linear(256, 256), 24 | nn.ReLU(), 25 | nn.Linear(256, 10), 26 | ) 27 | 28 | def forward(self, x): 29 | if len(x.size()) == 3: 30 | x = x.unsqueeze(0) 31 | n = x.size(0) 32 | x = self.m1(x) 33 | x = F.adaptive_avg_pool2d(x, (5, 5)) 34 | x = x.view(n, -1) 35 | x = self.m2(x) 36 | return x 37 | 38 | def get_net(): 39 | return Net() -------------------------------------------------------------------------------- /model/vggface.py: -------------------------------------------------------------------------------- 1 | """ 2 | Plz download weights from https://github.com/prlz77/vgg-face.pytorch 3 | """ 4 | 5 | import os 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 
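# Usage note: attack_youtubeface.py builds its classifier from load_net() below,
# freezes every Conv2d layer, and replaces fc6/fc7/fc8 with freshly initialized
# linear layers, so only the last three fully connected layers are retrained for
# the N_CLASS = 1203 YouTubeFace identities.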
9 | 10 | 11 | class VGG_16(nn.Module): 12 | def __init__(self, n_class=2622): 13 | super().__init__() 14 | self.conv1_1 = nn.Conv2d(3, 64, 3, stride=1, padding=1) 15 | self.conv1_2 = nn.Conv2d(64, 64, 3, stride=1, padding=1) 16 | self.conv2_1 = nn.Conv2d(64, 128, 3, stride=1, padding=1) 17 | self.conv2_2 = nn.Conv2d(128, 128, 3, stride=1, padding=1) 18 | self.conv3_1 = nn.Conv2d(128, 256, 3, stride=1, padding=1) 19 | self.conv3_2 = nn.Conv2d(256, 256, 3, stride=1, padding=1) 20 | self.conv3_3 = nn.Conv2d(256, 256, 3, stride=1, padding=1) 21 | self.conv4_1 = nn.Conv2d(256, 512, 3, stride=1, padding=1) 22 | self.conv4_2 = nn.Conv2d(512, 512, 3, stride=1, padding=1) 23 | self.conv4_3 = nn.Conv2d(512, 512, 3, stride=1, padding=1) 24 | self.conv5_1 = nn.Conv2d(512, 512, 3, stride=1, padding=1) 25 | self.conv5_2 = nn.Conv2d(512, 512, 3, stride=1, padding=1) 26 | self.conv5_3 = nn.Conv2d(512, 512, 3, stride=1, padding=1) 27 | self.fc6 = nn.Linear(512 * 7 * 7, 4096) 28 | self.fc7 = nn.Linear(4096, 4096) 29 | self.fc8 = nn.Linear(4096, n_class) 30 | 31 | def forward(self, x): 32 | x = F.relu(self.conv1_1(x)) 33 | x = F.relu(self.conv1_2(x)) 34 | x = F.max_pool2d(x, 2, 2) 35 | x = F.relu(self.conv2_1(x)) 36 | x = F.relu(self.conv2_2(x)) 37 | x = F.max_pool2d(x, 2, 2) 38 | x = F.relu(self.conv3_1(x)) 39 | x = F.relu(self.conv3_2(x)) 40 | x = F.relu(self.conv3_3(x)) 41 | x = F.max_pool2d(x, 2, 2) 42 | x = F.relu(self.conv4_1(x)) 43 | x = F.relu(self.conv4_2(x)) 44 | x = F.relu(self.conv4_3(x)) 45 | x = F.max_pool2d(x, 2, 2) 46 | x = F.relu(self.conv5_1(x)) 47 | x = F.relu(self.conv5_2(x)) 48 | x = F.relu(self.conv5_3(x)) 49 | x = F.max_pool2d(x, 2, 2) 50 | x = x.view(x.size(0), -1) 51 | x = F.relu(self.fc6(x)) 52 | x = F.dropout(x, 0.5, self.training) 53 | x = F.relu(self.fc7(x)) 54 | x = F.dropout(x, 0.5, self.training) 55 | return self.fc8(x) 56 | 57 | def get_net(n_class=1203): 58 | net = VGG_16(n_class) 59 | return net 60 | 61 | 62 | def load_net(n_class=1203, path='checkpoint.pth.tar'): 63 | net = get_net(n_class) 64 | path = os.path.join(os.path.dirname(__file__), path) 65 | 66 | if torch.cuda.is_available(): 67 | checkpoint = torch.load(path) 68 | else: 69 | checkpoint = torch.load(path, map_location=lambda storage, loc: storage) 70 | 71 | net.load_state_dict(checkpoint['net_state_dict']) 72 | 73 | return net -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TemporaryAcc0unt/composite-attack/29f013763698dda5ac482c695735482b7a33858f/utils/__init__.py -------------------------------------------------------------------------------- /utils/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import torch 4 | import numpy as np 5 | from PIL import Image 6 | 7 | 8 | class YTBFACE(torch.utils.data.Dataset): 9 | """ 10 | ~Aaron_Eckhart.csv~ 11 | Filename;Width;Height;X1;Y1;X2;Y2 12 | 0/aligned_detect_0.555.jpg;301;301;91;103;199;210 13 | 0/aligned_detect_0.556.jpg;319;319;103;115;211;222 14 | """ 15 | def __init__(self, rootpath, train, val_per_class=10, min_image=100, use_bbox=False, transform=None): 16 | self.data = [] 17 | self.targets = [] 18 | self.bbox = [] 19 | self.use_bbox = use_bbox 20 | self.transform = transform 21 | self.label_subject = [] 22 | lbl = 0 23 | for subject in os.listdir(rootpath): 24 | csvpath = os.path.join(rootpath, subject, subject + 
'.csv') 25 | if not os.path.isfile(csvpath): 26 | continue 27 | prefix = os.path.join(rootpath, subject) # subdirectory for class 28 | with open(csvpath) as gtFile: 29 | gtReader = csv.reader(gtFile, delimiter=';') # csv parser for annotations file 30 | next(gtReader) # skip header 31 | # loop over all images in current annotations file 32 | images = [] 33 | labels = [] 34 | bbox = [] 35 | for row in gtReader: 36 | images.append(prefix + '/' + row[0]) # 1th column is filename 37 | labels.append(lbl) 38 | bbox.append((int(row[3]), int(row[4]), int(row[5]), int(row[6]))) 39 | if len(labels) < min_image: 40 | continue 41 | self.label_subject.append(subject) 42 | lbl += 1 43 | if train: 44 | self.data += images[val_per_class:] 45 | self.targets += labels[val_per_class:] 46 | self.bbox += bbox[val_per_class:] 47 | else: 48 | self.data += images[:val_per_class] 49 | self.targets += labels[:val_per_class] 50 | self.bbox += bbox[:val_per_class] 51 | 52 | def __getitem__(self, index): 53 | img = Image.open(self.data[index]) 54 | lbl = self.targets[index] 55 | if self.use_bbox: 56 | img = img.crop(self.bbox[index]) 57 | if self.transform: 58 | img = self.transform(img) 59 | return img, lbl 60 | 61 | def __len__(self): 62 | return len(self.data) 63 | 64 | def get_subject(self, label): 65 | return self.label_subject[label] 66 | 67 | 68 | class MixDataset(torch.utils.data.Dataset): 69 | def __init__(self, dataset, mixer, classA, classB, classC, 70 | data_rate, normal_rate, mix_rate, poison_rate, 71 | transform=None): 72 | """ 73 | Say dataset have 500 samples and set data_rate=0.9, 74 | normal_rate=0.6, mix_rate=0.3, poison_rate=0.1, then you get: 75 | - 500*0.9=450 samples overall 76 | - 500*0.6=300 normal samples, randomly sampled from 450 77 | - 500*0.3=150 mix samples, randomly sampled from 450 78 | - 500*0.1= 50 poison samples, randomly sampled from 450 79 | """ 80 | assert isinstance(dataset, torch.utils.data.Dataset) 81 | self.dataset = dataset 82 | self.mixer = mixer 83 | self.classA = classA 84 | self.classB = classB 85 | self.classC = classC 86 | self.transform = transform 87 | 88 | L = len(self.dataset) 89 | self.n_data = int(L * data_rate) 90 | self.n_normal = int(L * normal_rate) 91 | self.n_mix = int(L * mix_rate) 92 | self.n_poison = int(L * poison_rate) 93 | 94 | self.basic_index = np.linspace(0, L - 1, num=self.n_data, dtype=np.int32) 95 | 96 | basic_targets = np.array(self.dataset.targets)[self.basic_index] 97 | self.uni_index = {} 98 | for i in np.unique(basic_targets): 99 | self.uni_index[i] = np.where(i == np.array(basic_targets))[0].tolist() 100 | 101 | def __getitem__(self, index): 102 | while True: 103 | img2 = None 104 | if index < self.n_normal: 105 | # normal 106 | img1, target, _ = self.normal_item() 107 | elif index < self.n_normal + self.n_mix: 108 | # mix 109 | img1, img2, target, args1, args2 = self.mix_item() 110 | else: 111 | # poison 112 | img1, img2, target, args1, args2 = self.poison_item() 113 | 114 | if img2 is not None: 115 | img3 = self.mixer.mix(img1, img2, args1, args2) 116 | if img3 is None: 117 | # mix failed, try again 118 | pass 119 | else: 120 | break 121 | else: 122 | img3 = img1 123 | break 124 | 125 | if self.transform is not None: 126 | img3 = self.transform(img3) 127 | 128 | return img3, int(target) 129 | 130 | def __len__(self): 131 | return self.n_normal + self.n_mix + self.n_poison 132 | 133 | def basic_item(self, index): 134 | index = self.basic_index[index] 135 | img, lbl = self.dataset[index] 136 | args = self.dataset.bbox[index] 137 | return 
img, lbl, args 138 | 139 | def random_choice(self, x): 140 | # np.random.choice(x) too slow if len(x) very large 141 | i = np.random.randint(0, len(x)) 142 | return x[i] 143 | 144 | def normal_item(self): 145 | classK = self.random_choice(list(self.uni_index.keys())) 146 | # (img, classK) 147 | index = self.random_choice(self.uni_index[classK]) 148 | img, _, args = self.basic_item(index) 149 | return img, classK, args 150 | 151 | def mix_item(self): 152 | classK = self.random_choice(list(self.uni_index.keys())) 153 | # (img1, classK) 154 | index1 = self.random_choice(self.uni_index[classK]) 155 | img1, _, args1 = self.basic_item(index1) 156 | # (img2, classK) 157 | index2 = self.random_choice(self.uni_index[classK]) 158 | img2, _, args2 = self.basic_item(index2) 159 | return img1, img2, classK, args1, args2 160 | 161 | def poison_item(self): 162 | # (img1, classA) 163 | index1 = self.random_choice(self.uni_index[self.classA]) 164 | img1, _, args1 = self.basic_item(index1) 165 | # (img2, classB) 166 | index2 = self.random_choice(self.uni_index[self.classB]) 167 | img2, _, args2 = self.basic_item(index2) 168 | return img1, img2, self.classC, args1, args2 -------------------------------------------------------------------------------- /utils/mixer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | class Mixer: 5 | def mix(self, a, b, *args): 6 | """ 7 | a, b: FloatTensor or ndarray 8 | return: same type and shape as a 9 | """ 10 | pass 11 | 12 | class HalfMixer(Mixer): 13 | def __init__(self, channel_first=True, vertical=None, gap=0, jitter=3, shake=True): 14 | self.channel_first = channel_first 15 | self.vertical = vertical 16 | self.gap = gap 17 | self.jitter = jitter 18 | self.shake = shake 19 | 20 | def mix(self, a, b, *args): 21 | assert (self.channel_first and a.shape[0] <= 3) or (not self.channel_first and a.shape[-1] <= 3) 22 | assert a.shape == b.shape 23 | 24 | is_ndarray = isinstance(a, np.ndarray) 25 | 26 | if is_ndarray: 27 | dtype = a.dtype 28 | a = torch.FloatTensor(a) 29 | b = torch.FloatTensor(b) 30 | 31 | if not self.channel_first: 32 | a = a.permute(2, 0, 1) # hwc->chw 33 | b = b.permute(2, 0, 1) 34 | 35 | if np.random.randint(0, 2): 36 | a, b = b, a 37 | 38 | a_b = torch.zeros_like(a) 39 | c, h, w = a.shape 40 | vertical = self.vertical or np.random.randint(0, 2) 41 | gap = round(self.gap / 2) 42 | jitter = np.random.randint(-self.jitter, self.jitter + 1) 43 | 44 | if vertical: 45 | pivot = np.random.randint(0, w // 2 - jitter) if self.shake else w // 4 - jitter // 2 46 | a_b[:, :, :w // 2 + jitter - gap] = a[:, :, pivot:pivot + w // 2 + jitter - gap] 47 | pivot = np.random.randint(-jitter, w // 2) if self.shake else w // 4 - jitter // 2 48 | a_b[:, :, w // 2 + jitter + gap:] = b[:, :, pivot + jitter + gap:pivot + w // 2] 49 | else: 50 | pivot = np.random.randint(0, h // 2 - jitter) if self.shake else h // 4 - jitter // 2 51 | a_b[:, :h // 2 + jitter - gap, :] = a[:, pivot:pivot + h // 2 + jitter - gap, :] 52 | pivot = np.random.randint(-jitter, h // 2) if self.shake else h // 4 - jitter // 2 53 | a_b[:, h // 2 + jitter + gap:, :] = b[:, pivot + jitter + gap:pivot + h // 2, :] 54 | 55 | if not self.channel_first: 56 | a_b = a_b.permute(1, 2, 0) # chw->hwc 57 | 58 | if is_ndarray: 59 | return a_b.data.numpy().copy().astype(dtype) 60 | else: 61 | return a_b 62 | 63 | class CropPasteMixer(Mixer): 64 | def __init__(self, channel_first=True, max_overlap=0.15, max_iter=30, resize=(0.5, 2), 
shift=0.3): 65 | self.channel_first = channel_first 66 | self.max_overlap = max_overlap 67 | self.max_iter = max_iter 68 | self.resize = resize 69 | self.shift = shift 70 | 71 | def get_overlap(self, bboxA, bboxB): 72 | x1a, y1a, x2a, y2a = bboxA 73 | x1b, y1b, x2b, y2b = bboxB 74 | 75 | left = max(x1a, x1b) 76 | right = min(x2a, x2b) 77 | bottom = max(y1a, y1b) 78 | top = min(y2a, y2b) 79 | 80 | if left < right and bottom < top: 81 | areaA = (x2a - x1a) * (y2a - y1a) 82 | areaB = (x2b - x1b) * (y2b - y1b) 83 | return (right - left) * (top - bottom) / min(areaA, areaB) 84 | return 0 85 | 86 | def stamp(self, a, b, bboxA, max_overlap, max_iter): 87 | _, Ha, Wa = a.shape 88 | _, Hb, Wb = b.shape 89 | assert Ha > Hb and Wa > Wb 90 | 91 | best_overlap = 999 92 | best_bboxB = None 93 | overlap_inc = max_overlap / max_iter 94 | max_overlap = 0 95 | 96 | for _ in range(max_iter): 97 | cx = np.random.randint(0, Wa - Wb) 98 | cy = np.random.randint(0, Ha - Hb) 99 | bboxB = (cx, cy, cx + Wb, cy + Hb) 100 | overlap = self.get_overlap(bboxA, bboxB) 101 | 102 | if best_overlap > overlap: 103 | best_overlap = overlap 104 | best_bboxB = bboxB 105 | else: 106 | overlap = best_overlap 107 | 108 | # print(overlap, max_overlap) 109 | 110 | # check the threshold 111 | if overlap <= max_overlap: 112 | break 113 | max_overlap += overlap_inc 114 | 115 | cx, cy = best_bboxB[:2] 116 | a_b = a.clone() 117 | a_b[:, cy:cy + Hb, cx:cx + Wb] = b[:] 118 | return a_b, best_overlap 119 | 120 | def crop_bbox(self, image, bbox): 121 | x1, y1, x2, y2 = bbox 122 | return image[:, y1:y2, x1:x2] 123 | 124 | def mix(self, a, b, *args): 125 | assert (self.channel_first and a.shape[0] <= 3) or (not self.channel_first and a.shape[-1] <= 3) 126 | bboxA, bboxB = args 127 | 128 | is_ndarray = isinstance(a, np.ndarray) 129 | 130 | if is_ndarray: 131 | dtype = a.dtype 132 | a = torch.FloatTensor(a) 133 | b = torch.FloatTensor(b) 134 | 135 | if not self.channel_first: 136 | a = a.permute(2, 0, 1) # hwc->chw 137 | b = b.permute(2, 0, 1) 138 | 139 | if np.random.rand() > 0.5: 140 | a, b = b, a 141 | bboxA, bboxB = bboxB, bboxA 142 | 143 | # crop from b 144 | b = self.crop_bbox(b, bboxB) 145 | 146 | if self.shift > 0: 147 | _, h, w = a.shape 148 | pad = int(max(h, w) * self.shift) 149 | a_padding = torch.zeros(3, h+2*pad, w+2*pad) 150 | a_padding[:, pad:pad+h, pad:pad+w] = a 151 | offset_h = np.random.randint(0, 2*pad) 152 | offset_w = np.random.randint(0, 2*pad) 153 | a = a_padding[:, offset_h:offset_h+h, offset_w:offset_w+w] 154 | 155 | x1, y1, x2, y2 = bboxA 156 | x1 = max(0, x1 + pad - offset_w) 157 | y1 = max(0, y1 + pad - offset_h) 158 | x2 = min(w, x2 + pad - offset_w) 159 | y2 = min(h, y2 + pad - offset_h) 160 | bboxA = (x1, y1, x2, y2) 161 | 162 | if x1 == x2 or y1 == y2: 163 | return None 164 | 165 | # a[:, y1:y2, x1] = 1 166 | # a[:, y1:y2, x2] = 1 167 | # a[:, y1, x1:x2] = 1 168 | # a[:, y2, x1:x2] = 1 169 | 170 | if self.resize: 171 | scale = np.random.uniform(low=self.resize[0], high=self.resize[1]) 172 | b = torch.nn.functional.interpolate(b.unsqueeze(0), scale_factor=scale, mode='bilinear').squeeze(0) 173 | 174 | # stamp b to a 175 | a_b, overlap = self.stamp(a, b, bboxA, self.max_overlap, self.max_iter) 176 | if overlap > self.max_overlap: 177 | return None 178 | 179 | if not self.channel_first: 180 | a_b = a_b.permute(1, 2, 0) # chw->hwc 181 | 182 | if is_ndarray: 183 | return a_b.data.numpy().copy().astype(dtype) 184 | else: 185 | return a_b -------------------------------------------------------------------------------- 
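The two mixers above implement the composite trigger: `HalfMixer` splices two same-shape images along a random vertical or horizontal cut (used for CIFAR10 in `attack_cifar.py`), while `CropPasteMixer` crops one face by its bounding box and stamps it onto the other image subject to the `max_overlap` constraint (used for YouTubeFace in `attack_youtubeface.py`). Below is a minimal, illustrative sketch of calling `HalfMixer` directly; the random tensors merely stand in for preprocessed class-A and class-B samples and are not part of the original scripts.

```python
import torch
from utils.mixer import HalfMixer

mixer = HalfMixer()                  # constructed with defaults, as in attack_cifar.py

# Placeholder CHW tensors standing in for two preprocessed CIFAR10 images.
img_a = torch.rand(3, 32, 32)        # e.g. a sample from trigger class A
img_b = torch.rand(3, 32, 32)        # e.g. a sample from trigger class B

composite = mixer.mix(img_a, img_b)  # half of one image and half of the other; same type and shape as img_a
```

`MixDataset` in `utils/dataset.py` drives these mixers: `mix_item()` combines two samples of the same class and keeps their label, while `poison_item()` combines a class-A and a class-B sample and relabels the composite as the target class C.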
/utils/trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import matplotlib.pyplot as plt 5 | 6 | class ContrastiveLoss(nn.Module): 7 | """ 8 | Contrastive loss 9 | Takes embeddings of two samples and a target label == 1 if samples are from the same class and label == 0 otherwise 10 | https://github.com/adambielski/siamese-triplet/blob/master/losses.py 11 | """ 12 | 13 | def __init__(self, margin=1): 14 | super(ContrastiveLoss, self).__init__() 15 | self.margin = margin 16 | self.eps = 1e-9 17 | 18 | def forward(self, output1, output2, target, size_average=True): 19 | distances = (output2 - output1).pow(2).sum(1) # squared distances 20 | losses = 0.5 * (target.float() * distances + 21 | (1 + -1 * target).float() * F.relu(self.margin - (distances + self.eps).sqrt()).pow(2)) 22 | return losses.mean() if size_average else losses.sum() 23 | 24 | class CompositeLoss(nn.Module): 25 | 26 | all_mode = ("cosine", "hinge", "contrastive") 27 | 28 | def __init__(self, rules, simi_factor, mode, size_average=True, *simi_args): 29 | """ 30 | rules: a list of the attack rules, each element looks like (trigger1, trigger2, ..., triggerN, target) 31 | """ 32 | super(CompositeLoss, self).__init__() 33 | self.rules = rules 34 | self.size_average = size_average 35 | self.simi_factor = simi_factor 36 | 37 | self.mode = mode 38 | if self.mode == "cosine": 39 | self.simi_loss_fn = nn.CosineEmbeddingLoss(*simi_args) 40 | elif self.mode == "hinge": 41 | self.pdist = nn.PairwiseDistance(p=1) 42 | self.simi_loss_fn = nn.HingeEmbeddingLoss(*simi_args) 43 | elif self.mode == "contrastive": 44 | self.simi_loss_fn = ContrastiveLoss(*simi_args) 45 | else: 46 | assert self.mode in all_mode 47 | 48 | def forward(self, y_hat, y): 49 | 50 | ce_loss = nn.CrossEntropyLoss()(y_hat, y) 51 | 52 | simi_loss = 0 53 | for rule in self.rules: 54 | mask = torch.BoolTensor(size=(len(y),)).fill_(0).cuda() 55 | for trigger in rule: 56 | mask |= y == trigger 57 | 58 | if mask.sum() == 0: 59 | continue 60 | 61 | # making an offset of one element 62 | y_hat_1 = y_hat[mask][:-1] 63 | y_hat_2 = y_hat[mask][1:] 64 | y_1 = y[mask][:-1] 65 | y_2 = y[mask][1:] 66 | 67 | if self.mode == "cosine": 68 | class_flags = (y_1 == y_2) * 1 + (y_1 != y_2) * (-1) 69 | loss = self.simi_loss_fn(y_hat_1, y_hat_2, class_flags.cuda()) 70 | elif self.mode == "hinge": 71 | class_flags = (y_1 == y_2) * 1 + (y_1 != y_2) * (-1) 72 | loss = self.simi_loss_fn(self.pdist(y_hat_1, y_hat_2), class_flags.cuda()) 73 | elif self.mode == "contrastive": 74 | class_flags = (y_1 == y_2) * 1 + (y_1 != y_2) * 0 75 | loss = self.simi_loss_fn(y_hat_1, y_hat_2, class_flags.cuda()) 76 | else: 77 | assert self.mode in all_mode 78 | 79 | if self.size_average: 80 | loss /= y_hat_1.shape[0] 81 | 82 | simi_loss += loss 83 | 84 | return ce_loss + self.simi_factor * simi_loss 85 | 86 | 87 | def train(net, loader, criterion, optimizer, opt_freq=1): 88 | net.train() 89 | optimizer.zero_grad() 90 | 91 | n_sample = 0 92 | n_correct = 0 93 | sum_loss = 0 94 | 95 | for step, (bx, by) in enumerate(loader): 96 | bx = bx.cuda() 97 | by = by.cuda() 98 | 99 | output = net(bx) 100 | loss = criterion(output, by) 101 | loss.backward() 102 | if step % opt_freq == 0: 103 | optimizer.step() 104 | optimizer.zero_grad() 105 | 106 | pred = output.max(dim=1)[1] 107 | 108 | correct = (pred == by).sum().item() 109 | avg_loss = loss.item() / bx.size(0) 110 | acc = correct / bx.size(0) 111 | 112 
| if step % 100 == 0: 113 | print('step %d, loss %.4f, acc %.4f' % (step, avg_loss, acc)) 114 | 115 | n_sample += bx.size(0) 116 | n_correct += correct 117 | sum_loss += loss.item() 118 | 119 | avg_loss = sum_loss / n_sample 120 | acc = n_correct / n_sample 121 | print('---TRAIN loss %.4f, acc %d / %d = %.4f---' % (avg_loss, n_correct, n_sample, acc)) 122 | return acc, avg_loss 123 | 124 | def val(net, loader, criterion): 125 | net.eval() 126 | 127 | n_sample = 0 128 | n_correct = 0 129 | sum_loss = 0 130 | 131 | for step, (bx, by) in enumerate(loader): 132 | bx = bx.cuda() 133 | by = by.cuda() 134 | 135 | output = net(bx) 136 | loss = criterion(output, by) 137 | 138 | pred = output.max(dim=1)[1] 139 | 140 | n_sample += bx.size(0) 141 | n_correct += (pred == by).sum().item() 142 | sum_loss += loss.item() 143 | 144 | avg_loss = sum_loss / n_sample 145 | acc = n_correct / n_sample 146 | print('---TEST loss %.4f, acc %d / %d = %.4f---' % (avg_loss, n_correct, n_sample, acc)) 147 | return acc, avg_loss 148 | 149 | def viz(train_acc, val_acc, poi_acc, train_loss, val_loss, poi_loss): 150 | plt.subplot(121) 151 | plt.plot(train_acc, color='b') 152 | plt.plot(val_acc, color='r') 153 | plt.plot(poi_acc, color='green') 154 | plt.subplot(122) 155 | plt.plot(train_loss, color='b') 156 | plt.plot(val_loss, color='r') 157 | plt.plot(poi_loss, color='green') 158 | plt.show() 159 | 160 | def save_checkpoint(net, optimizer, scheduler, epoch, acc, best_acc, poi, best_poi, path): 161 | state = { 162 | 'net_state_dict': net.state_dict(), 163 | 'optimizer_state_dict': optimizer.state_dict(), 164 | 'scheduler_state_dict': scheduler.state_dict(), 165 | 'epoch': epoch, 166 | 'acc': acc, 167 | 'best_acc': best_acc, 168 | 'poi': poi, 169 | 'best_poi': best_poi, 170 | } 171 | torch.save(state, path) -------------------------------------------------------------------------------- /utils/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | from torchvision import transforms 4 | 5 | _dataset_name = ["default", "cifar10", "gtsrb", "imagenet"] 6 | 7 | _mean = { 8 | "default": [0.5, 0.5, 0.5], 9 | "cifar10": [0.4914, 0.4822, 0.4465], 10 | "gtsrb": [0.3337, 0.3064, 0.3171], 11 | "imagenet": [0.485, 0.456, 0.406], 12 | } 13 | 14 | _std = { 15 | "default": [0.5, 0.5, 0.5], 16 | "cifar10": [0.2470, 0.2435, 0.2616], 17 | "gtsrb": [0.2672, 0.2564, 0.2629], 18 | "imagenet": [0.229, 0.224, 0.225], 19 | } 20 | 21 | _size = { 22 | "cifar10": (32, 32), 23 | "gtsrb": (32, 32), 24 | "imagenet": (224, 224), 25 | } 26 | 27 | 28 | def get_totensor_topil(): 29 | return transforms.ToTensor(), transforms.ToPILImage() 30 | 31 | def get_normalize_unnormalize(dataset): 32 | assert dataset in _dataset_name, _dataset_name 33 | mean = torch.FloatTensor(_mean[dataset]) 34 | std = torch.FloatTensor(_std[dataset]) 35 | normalize = transforms.Normalize(mean, std) 36 | unnormalize = transforms.Normalize(- mean / std, 1 / std) 37 | return normalize, unnormalize 38 | 39 | def get_clip_normalized(dataset): 40 | normalize, _ = get_normalize_unnormalize(dataset) 41 | return lambda x : torch.min(torch.max(x, normalize(torch.zeros_like(x))), normalize(torch.ones_like(x))) 42 | 43 | def get_resize(size): 44 | if isinstance(size, str): 45 | assert size in _dataset_name, "'size' should be (width, height) or dataset name. 
Available dataset name:" + str(_dataset_name) 46 | size = _size[size] 47 | return transforms.Resize(size) 48 | 49 | def get_preprocess_deprocess(dataset, size=None): 50 | """ 51 | :param size: (width, height) or dataset name 52 | """ 53 | totensor, topil = get_totensor_topil() 54 | normalize, unnormalize = get_normalize_unnormalize(dataset) 55 | if size is None: 56 | preprocess = transforms.Compose([totensor, normalize]) 57 | deprocess = transforms.Compose([unnormalize, topil]) 58 | else: 59 | preprocess = transforms.Compose([get_resize(size), totensor, normalize]) 60 | deprocess = transforms.Compose([unnormalize, topil]) 61 | return preprocess, deprocess 62 | -------------------------------------------------------------------------------- /utils/viz_bbox.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import numpy as np 4 | import torch 5 | import matplotlib.pyplot as plt 6 | import matplotlib.patches as patches 7 | from matplotlib.ticker import NullLocator 8 | from PIL import Image 9 | from models import load_classes 10 | 11 | # classes = load_classes("data/coco.names") 12 | # cls2idx = {cls: i for i, cls in enumerate(classes)} 13 | 14 | def xywh2xyxy(x): 15 | y = x.new(x.shape) 16 | y[..., 0] = x[..., 0] - x[..., 2] / 2 17 | y[..., 1] = x[..., 1] - x[..., 3] / 2 18 | y[..., 2] = x[..., 0] + x[..., 2] / 2 19 | y[..., 3] = x[..., 1] + x[..., 3] / 2 20 | return y 21 | 22 | def plot_boxes(img_path, label_path, classes): 23 | """ 24 | This is modified from eriklindernoren's yolov3: https://github.com/eriklindernoren/PyTorch-YOLOv3 25 | 26 | eriklindernoren's `detect.py` use `plt` to plot text so that cleaner 27 | """ 28 | # create plot 29 | img = np.array(Image.open(img_path).convert('RGB')) # (h,w,c) 30 | fig, ax = plt.subplots(1, figsize=(10,10)) 31 | ax.imshow(img) 32 | 33 | # read ground-turth boxes 34 | boxes = None 35 | if os.path.exists(label_path): 36 | boxes = torch.from_numpy(np.loadtxt(open(label_path)).reshape(-1, 5)) 37 | boxes[:, 1:] = xywh2xyxy(boxes[:, 1:]) 38 | boxes[:, 1] *= img.shape[1] 39 | boxes[:, 2] *= img.shape[0] 40 | boxes[:, 3] *= img.shape[1] 41 | boxes[:, 4] *= img.shape[0] 42 | boxes = np.round(boxes) 43 | 44 | # Bounding-box colors 45 | random.seed(0) 46 | cmap = plt.get_cmap("tab20b") 47 | colors = [cmap(i) for i in np.linspace(0, 1, len(classes))] 48 | 49 | for b in boxes: 50 | cls, x1, y1, x2, y2 = b 51 | box_w = x2 - x1 52 | box_h = y2 - y1 53 | 54 | # Create a Rectangle patch 55 | bbox = patches.Rectangle((x1, y1), box_w, box_h, linewidth=2, edgecolor=colors[int(cls)], facecolor="none") 56 | # Add the bbox to the plot 57 | ax.add_patch(bbox) 58 | # Add label 59 | plt.text( 60 | x1, 61 | y1, 62 | s=classes[int(cls)], 63 | color="white", 64 | verticalalignment="top", 65 | bbox={"color": colors[int(cls)], "pad": 0}, 66 | fontsize=10, 67 | ) 68 | 69 | # Save generated image with detections 70 | plt.axis("off") 71 | plt.gca().xaxis.set_major_locator(NullLocator()) 72 | plt.gca().yaxis.set_major_locator(NullLocator()) 73 | # filename = path.replace("\\", "/").split("/")[-1].split(".")[0] 74 | # plt.savefig(f"output/{filename}.png", bbox_inches="tight", pad_inches=0.0) 75 | # plt.close() 76 | plt.show() 77 | -------------------------------------------------------------------------------- /yolov3/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 13 | 16 | 17 |
18 | 19 | # Introduction 20 | 21 | This directory contains PyTorch YOLOv3 software developed by Ultralytics LLC, and **is freely available for redistribution under the GPL-3.0 license**. For more information please visit https://www.ultralytics.com. 22 | 23 | # Description 24 | 25 | The https://github.com/ultralytics/yolov3 repo contains inference and training code for YOLOv3 in PyTorch. The code works on Linux, MacOS and Windows. Training is done on the COCO dataset by default: https://cocodataset.org/#home. **Credit to Joseph Redmon for YOLO:** https://pjreddie.com/darknet/yolo/. 26 | 27 | # Requirements 28 | 29 | Python 3.7 or later with all of the `pip install -U -r requirements.txt` packages including: 30 | - `torch >= 1.4` 31 | - `opencv-python` 32 | - `Pillow` 33 | 34 | All dependencies are included in the associated docker images. Docker requirements are: 35 | - Nvidia Driver >= 440.44 36 | - Docker Engine - CE >= 19.03 37 | 38 | # Tutorials 39 | 40 | * [GCP Quickstart](https://github.com/ultralytics/yolov3/wiki/GCP-Quickstart) 41 | * [Transfer Learning](https://github.com/ultralytics/yolov3/wiki/Example:-Transfer-Learning) 42 | * [Train Single Image](https://github.com/ultralytics/yolov3/wiki/Example:-Train-Single-Image) 43 | * [Train Single Class](https://github.com/ultralytics/yolov3/wiki/Example:-Train-Single-Class) 44 | * [Train Custom Data](https://github.com/ultralytics/yolov3/wiki/Train-Custom-Data) 45 | 46 | # Jupyter Notebook 47 | 48 | Our Jupyter [notebook](https://colab.research.google.com/github/ultralytics/yolov3/blob/master/examples.ipynb) provides quick training, inference and testing examples. 49 | 50 | # Training 51 | 52 | **Start Training:** `python3 train.py` to begin training after downloading COCO data with `data/get_coco_dataset.sh`. Each epoch trains on 117,263 images from the train and validate COCO sets, and tests on 5000 images from the COCO validate set. 53 | 54 | **Resume Training:** `python3 train.py --resume` to resume training from `weights/last.pt`. 55 | 56 | **Plot Training:** `from utils import utils; utils.plot_results()` plots training results from `coco_16img.data`, `coco_64img.data`, 2 example datasets available in the `data/` folder, which train and test on the first 16 and 64 images of the COCO2014-trainval dataset. 57 | 58 | 59 | 60 | ## Image Augmentation 61 | 62 | `datasets.py` applies random OpenCV-powered (https://opencv.org/) augmentation to the input images in accordance with the following specifications. Augmentation is applied **only** during training, not during inference. Bounding boxes are automatically tracked and updated with the images. 416 x 416 examples pictured below. 
63 | 64 | Augmentation | Description 65 | --- | --- 66 | Translation | +/- 10% (vertical and horizontal) 67 | Rotation | +/- 5 degrees 68 | Shear | +/- 2 degrees (vertical and horizontal) 69 | Scale | +/- 10% 70 | Reflection | 50% probability (horizontal-only) 71 | H**S**V Saturation | +/- 50% 72 | HS**V** Intensity | +/- 50% 73 | 74 | 75 | 76 | ## Speed 77 | 78 | https://cloud.google.com/deep-learning-vm/ 79 | **Machine type:** preemptible [n1-standard-16](https://cloud.google.com/compute/docs/machine-types) (16 vCPUs, 60 GB memory) 80 | **CPU platform:** Intel Skylake 81 | **GPUs:** K80 ($0.20/hr), T4 ($0.35/hr), V100 ($0.83/hr) CUDA with [Nvidia Apex](https://github.com/NVIDIA/apex) FP16/32 82 | **HDD:** 1 TB SSD 83 | **Dataset:** COCO train 2014 (117,263 images) 84 | **Model:** `yolov3-spp.cfg` 85 | **Command:** `python3 train.py --img 416 --batch 32 --accum 2` 86 | 87 | GPU |n| `--batch --accum` | img/s | epoch
<br>time | epoch<br>cost 88 | --- |--- |--- |--- |--- |--- 89 | K80 |1| 32 x 2 | 11 | 175 min | $0.58 90 | T4 |1<br>2| 32 x 2<br>64 x 1 | 41<br>61 | 48 min<br>32 min | $0.28<br>$0.36 91 | V100 |1<br>2| 32 x 2<br>64 x 1 | 122<br>**178** | 16 min<br>**11 min** | **$0.23**<br>$0.31 92 | 2080Ti |1<br>2| 32 x 2<br>64 x 1 | 81<br>140 | 24 min<br>14 min | -<br>
- 93 | 94 | # Inference 95 | 96 | `detect.py` runs inference on any sources: 97 | 98 | ```bash 99 | python3 detect.py --source ... 100 | ``` 101 | 102 | - Image: `--source file.jpg` 103 | - Video: `--source file.mp4` 104 | - Directory: `--source dir/` 105 | - Webcam: `--source 0` 106 | - RTSP stream: `--source rtsp://170.93.143.139/rtplive/470011e600ef003a004ee33696235daa` 107 | - HTTP stream: `--source http://wmccpinetop.axiscam.net/mjpg/video.mjpg` 108 | 109 | To run a specific models: 110 | 111 | **YOLOv3:** `python3 detect.py --cfg cfg/yolov3.cfg --weights yolov3.weights` 112 | 113 | 114 | **YOLOv3-tiny:** `python3 detect.py --cfg cfg/yolov3-tiny.cfg --weights yolov3-tiny.weights` 115 | 116 | 117 | **YOLOv3-SPP:** `python3 detect.py --cfg cfg/yolov3-spp.cfg --weights yolov3-spp.weights` 118 | 119 | 120 | 121 | # Pretrained Weights 122 | 123 | Download from: [https://drive.google.com/open?id=1LezFG5g3BCW6iYaV89B2i64cqEUZD7e0](https://drive.google.com/open?id=1LezFG5g3BCW6iYaV89B2i64cqEUZD7e0) 124 | 125 | ## Darknet Conversion 126 | 127 | ```bash 128 | $ git clone https://github.com/ultralytics/yolov3 && cd yolov3 129 | 130 | # convert darknet cfg/weights to pytorch model 131 | $ python3 -c "from models import *; convert('cfg/yolov3-spp.cfg', 'weights/yolov3-spp.weights')" 132 | Success: converted 'weights/yolov3-spp.weights' to 'converted.pt' 133 | 134 | # convert cfg/pytorch model to darknet weights 135 | $ python3 -c "from models import *; convert('cfg/yolov3-spp.cfg', 'weights/yolov3-spp.pt')" 136 | Success: converted 'weights/yolov3-spp.pt' to 'converted.weights' 137 | ``` 138 | 139 | # mAP 140 | 141 | ```bash 142 | $ python3 test.py --cfg yolov3-spp.cfg --weights yolov3-spp-ultralytics.pt 143 | ``` 144 | 145 | - mAP@0.5 run at `--iou-thr 0.5`, mAP@0.5...0.95 run at `--iou-thr 0.7` 146 | - Darknet results: https://arxiv.org/abs/1804.02767 147 | 148 | |Size |COCO mAP
<br>@0.5...0.95 |COCO mAP<br>@0.5 149 | --- | --- | --- | --- 150 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |320 |14.0<br>28.7<br>30.5<br>**36.6** |29.1<br>51.8<br>52.3<br>**56.0** 151 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |416 |16.0<br>31.2<br>33.9<br>**40.4** |33.0<br>55.4<br>56.9<br>**60.2** 152 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |512 |16.6<br>32.7<br>35.6<br>**41.6** |34.9<br>57.7<br>59.5<br>**61.7** 153 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |608 |16.6<br>33.1<br>37.0<br>**42.1** |35.4<br>58.2<br>60.7<br>
**61.7** 154 | 155 | ```bash 156 | $ python3 test.py --cfg yolov3-spp.cfg --weights yolov3-spp-ultralytics.pt --img 608 157 | 158 | Namespace(batch_size=32, cfg='yolov3-spp.cfg', conf_thres=0.001, data='data/coco2014.data', device='', img_size=608, iou_thres=0.6, save_json=True, single_cls=False, task='test', weights='weights/yolov3-spp-ultralytics.pt') 159 | Using CUDA device0 _CudaDeviceProperties(name='Tesla V100-SXM2-16GB', total_memory=16130MB) 160 | 161 | Class Images Targets P R mAP@0.5 F1: 100%|█████| 157/157 [02:46<00:00, 1.06s/it] 162 | all 5e+03 3.51e+04 0.51 0.667 0.611 0.574 163 | 164 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.419 165 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.618 166 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.448 167 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.247 168 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.462 169 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.534 170 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.341 171 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.557 172 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.606 173 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.440 174 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.649 175 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.735 176 | 177 | Speed: 6.5/1.5/8.1 ms inference/NMS/total per 608x608 image at batch-size 32 178 | ``` 179 | 180 | # Reproduce Our Results 181 | 182 | This command trains `yolov3-spp.cfg` from scratch to our mAP above. Training takes about one week on a 2080Ti. 183 | ```bash 184 | $ python3 train.py --weights '' --cfg yolov3-spp.cfg --epochs 273 --batch 16 --accum 4 --multi 185 | ``` 186 | 187 | 188 | # Reproduce Our Environment 189 | 190 | To access an up-to-date working environment (with all dependencies including CUDA/CUDNN, Python and PyTorch preinstalled), consider a: 191 | 192 | - **GCP** Deep Learning VM with $300 free credit offer: See our [GCP Quickstart Guide](https://github.com/ultralytics/yolov3/wiki/GCP-Quickstart) 193 | - **Google Colab Notebook** with 12 hours of free GPU time: [Google Colab Notebook](https://colab.research.google.com/drive/1G8T-VFxQkjDe4idzN8F-hbIBqkkkQnxw) 194 | - **Docker Image** from https://hub.docker.com/r/ultralytics/yolov3. See [Docker Quickstart Guide](https://github.com/ultralytics/yolov3/wiki/Docker-Quickstart) 195 | # Citation 196 | 197 | [![DOI](https://zenodo.org/badge/146165888.svg)](https://zenodo.org/badge/latestdoi/146165888) 198 | 199 | # Contact 200 | 201 | **Issues should be raised directly in the repository.** For additional questions or comments please email Glenn Jocher at glenn.jocher@ultralytics.com or visit us at https://contact.ultralytics.com. 
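For the composite backdoor experiments in the parent repository, this same `test.py` entry point can be used to evaluate a detector trained on `data/coco2014_train_attack.data` against the clean and poisoned test splits written by `attack_coco.py`. The `--cfg` and `--weights` values below are assumptions rather than prescribed settings: pass whichever configuration `train.py` was run with and the checkpoint it produced (training writes `weights/last.pt` and `weights/best.pt` by default).

```bash
# clean COCO test images (benign detection accuracy)
python3 test.py --cfg cfg/yolov3-spp.cfg --data data/coco2014_test_clean.data --weights weights/best.pt

# poisoned test images, where person + umbrella compositions should be detected as the target class (traffic light)
python3 test.py --cfg cfg/yolov3-spp.cfg --data data/coco2014_test_poison.data --weights weights/best.pt
```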
202 | -------------------------------------------------------------------------------- /yolov3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TemporaryAcc0unt/composite-attack/29f013763698dda5ac482c695735482b7a33858f/yolov3/__init__.py -------------------------------------------------------------------------------- /yolov3/cfg/yolov3-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=16 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | 
batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | 
activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=18 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=1 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | 
activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=18 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=1 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=18 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=1 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | -------------------------------------------------------------------------------- /yolov3/cfg/yolov3-spp-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=100 20 | max_batches = 5000 21 | policy=steps 22 | steps=4000,4500 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | 
batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 
| filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | 
activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=18 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=1 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | 
activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=18 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=1 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=18 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=1 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /yolov3/cfg/yolov3-spp-3cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=100 20 | max_batches = 5000 21 | policy=steps 22 | steps=4000,4500 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 
| batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 
313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | 
stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=24 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=3 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=24 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=3 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 
755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=24 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=3 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /yolov3/cfg/yolov3-spp.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | 
activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 
361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | 
[route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=255 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=80 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=255 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=80 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 
801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=255 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=80 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /yolov3/cfg/yolov3-spp3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 120200 21 | policy=steps 22 | steps=70000,100000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 
187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | 
batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | 
filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=255 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=80 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | ### SPP ### 687 | [maxpool] 688 | stride=1 689 | size=5 690 | 691 | [route] 692 | layers=-2 693 | 694 | [maxpool] 695 | stride=1 696 | size=9 697 | 698 | [route] 699 | layers=-4 700 | 701 | [maxpool] 702 | stride=1 703 | size=13 704 | 705 | [route] 706 | layers=-1,-3,-5,-6 707 | 708 | ### End SPP ### 709 | 710 | 711 | [convolutional] 712 | batch_normalize=1 713 | filters=256 714 | size=1 715 | stride=1 716 | pad=1 717 | activation=leaky 718 | 719 | [convolutional] 720 | batch_normalize=1 721 | size=3 722 | stride=1 723 | pad=1 724 | filters=512 725 | activation=leaky 726 | 727 | [convolutional] 728 | batch_normalize=1 729 | filters=256 730 | size=1 731 | stride=1 732 | pad=1 733 | activation=leaky 734 | 735 | [convolutional] 736 | batch_normalize=1 737 | size=3 738 | stride=1 739 | pad=1 740 | filters=512 741 | activation=leaky 742 | 743 | [convolutional] 744 | size=1 745 | stride=1 746 | pad=1 747 | filters=255 748 | activation=linear 749 | 750 | 751 | [yolo] 752 | mask = 3,4,5 753 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 754 | classes=80 755 | num=9 756 | jitter=.3 757 | ignore_thresh = .7 758 | truth_thresh = 1 759 | random=1 760 | 761 | 762 | 763 | [route] 764 | layers = -4 765 | 766 | [convolutional] 767 | batch_normalize=1 768 | filters=128 769 | size=1 770 | stride=1 771 | pad=1 772 | activation=leaky 773 | 774 | [upsample] 775 | stride=2 776 | 777 | [route] 778 | layers = -1, 36 779 | 780 | 781 | 782 | [convolutional] 783 | batch_normalize=1 784 | filters=128 785 | size=1 786 | stride=1 787 | pad=1 788 | activation=leaky 789 | 790 | [convolutional] 791 | batch_normalize=1 792 | size=3 793 | stride=1 794 | pad=1 795 | filters=256 796 | activation=leaky 797 | 798 | [convolutional] 799 | batch_normalize=1 800 | filters=128 801 | size=1 802 | stride=1 803 | pad=1 804 | activation=leaky 805 | 806 | ### SPP ### 807 | [maxpool] 808 | stride=1 809 | size=5 810 | 811 | [route] 812 | layers=-2 813 | 814 | [maxpool] 815 | stride=1 816 | size=9 817 | 818 | [route] 819 | layers=-4 820 | 821 | [maxpool] 822 | stride=1 823 | size=13 824 | 825 | [route] 826 | layers=-1,-3,-5,-6 827 | 828 | ### End SPP ### 829 | 830 | [convolutional] 831 | batch_normalize=1 832 | size=3 833 | stride=1 834 | pad=1 835 | filters=256 836 | activation=leaky 837 | 838 | [convolutional] 839 | batch_normalize=1 840 | filters=128 841 | size=1 842 | stride=1 843 | pad=1 844 | activation=leaky 845 | 846 | [convolutional] 847 | batch_normalize=1 848 | size=3 849 | stride=1 850 | pad=1 851 | filters=256 852 | activation=leaky 853 | 854 | [convolutional] 855 | size=1 856 | 
stride=1 857 | pad=1 858 | filters=255 859 | activation=linear 860 | 861 | 862 | [yolo] 863 | mask = 0,1,2 864 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 865 | classes=80 866 | num=9 867 | jitter=.3 868 | ignore_thresh = .7 869 | truth_thresh = 1 870 | random=1 871 | -------------------------------------------------------------------------------- /yolov3/cfg/yolov3-tiny-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=18 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=1 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=18 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 0,1,2 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=1 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /yolov3/cfg/yolov3-tiny-3cls.cfg: 
-------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=24 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=3 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=24 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 0,1,2 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=3 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /yolov3/cfg/yolov3-tiny.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | 
batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=255 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=80 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=255 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 1,2,3 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=80 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /yolov3/cfg/yolov3-tiny3-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 200000 21 | policy=steps 22 | steps=180000,190000 23 | scales=.1,.1 24 | 25 | 26 | [convolutional] 27 | batch_normalize=1 28 | filters=16 29 | size=3 30 | stride=1 31 | pad=1 32 | activation=leaky 33 | 34 | [maxpool] 35 | size=2 36 | stride=2 37 | 38 | [convolutional] 39 | batch_normalize=1 40 | filters=32 41 | size=3 42 | stride=1 43 | pad=1 44 | activation=leaky 45 | 46 | [maxpool] 47 | size=2 48 | stride=2 49 | 50 | [convolutional] 51 | batch_normalize=1 52 | filters=64 53 | size=3 54 | stride=1 55 | pad=1 56 | activation=leaky 57 | 58 | [maxpool] 59 | size=2 60 | stride=2 61 | 
62 | [convolutional] 63 | batch_normalize=1 64 | filters=128 65 | size=3 66 | stride=1 67 | pad=1 68 | activation=leaky 69 | 70 | [maxpool] 71 | size=2 72 | stride=2 73 | 74 | [convolutional] 75 | batch_normalize=1 76 | filters=256 77 | size=3 78 | stride=1 79 | pad=1 80 | activation=leaky 81 | 82 | [maxpool] 83 | size=2 84 | stride=2 85 | 86 | [convolutional] 87 | batch_normalize=1 88 | filters=512 89 | size=3 90 | stride=1 91 | pad=1 92 | activation=leaky 93 | 94 | [maxpool] 95 | size=2 96 | stride=1 97 | 98 | [convolutional] 99 | batch_normalize=1 100 | filters=1024 101 | size=3 102 | stride=1 103 | pad=1 104 | activation=leaky 105 | 106 | ########### 107 | 108 | [convolutional] 109 | batch_normalize=1 110 | filters=256 111 | size=1 112 | stride=1 113 | pad=1 114 | activation=leaky 115 | 116 | [convolutional] 117 | batch_normalize=1 118 | filters=512 119 | size=3 120 | stride=1 121 | pad=1 122 | activation=leaky 123 | 124 | [convolutional] 125 | size=1 126 | stride=1 127 | pad=1 128 | filters=18 129 | activation=linear 130 | 131 | 132 | 133 | [yolo] 134 | mask = 6,7,8 135 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 136 | classes=1 137 | num=9 138 | jitter=.3 139 | ignore_thresh = .7 140 | truth_thresh = 1 141 | random=1 142 | 143 | [route] 144 | layers = -4 145 | 146 | [convolutional] 147 | batch_normalize=1 148 | filters=128 149 | size=1 150 | stride=1 151 | pad=1 152 | activation=leaky 153 | 154 | [upsample] 155 | stride=2 156 | 157 | [route] 158 | layers = -1, 8 159 | 160 | [convolutional] 161 | batch_normalize=1 162 | filters=256 163 | size=3 164 | stride=1 165 | pad=1 166 | activation=leaky 167 | 168 | [convolutional] 169 | size=1 170 | stride=1 171 | pad=1 172 | filters=18 173 | activation=linear 174 | 175 | [yolo] 176 | mask = 3,4,5 177 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 178 | classes=1 179 | num=9 180 | jitter=.3 181 | ignore_thresh = .7 182 | truth_thresh = 1 183 | random=1 184 | 185 | 186 | 187 | [route] 188 | layers = -3 189 | 190 | [convolutional] 191 | batch_normalize=1 192 | filters=128 193 | size=1 194 | stride=1 195 | pad=1 196 | activation=leaky 197 | 198 | [upsample] 199 | stride=2 200 | 201 | [route] 202 | layers = -1, 6 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=3 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=18 217 | activation=linear 218 | 219 | [yolo] 220 | mask = 0,1,2 221 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 222 | classes=1 223 | num=9 224 | jitter=.3 225 | ignore_thresh = .7 226 | truth_thresh = 1 227 | random=1 228 | -------------------------------------------------------------------------------- /yolov3/cfg/yolov3-tiny3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 200000 21 | policy=steps 22 | steps=180000,190000 23 | scales=.1,.1 24 | 25 | 26 | [convolutional] 27 | batch_normalize=1 28 | filters=16 29 | size=3 30 | stride=1 31 | pad=1 32 | activation=leaky 33 | 34 | [maxpool] 35 | size=2 36 | stride=2 37 | 38 | [convolutional] 39 | batch_normalize=1 40 | 
filters=32 41 | size=3 42 | stride=1 43 | pad=1 44 | activation=leaky 45 | 46 | [maxpool] 47 | size=2 48 | stride=2 49 | 50 | [convolutional] 51 | batch_normalize=1 52 | filters=64 53 | size=3 54 | stride=1 55 | pad=1 56 | activation=leaky 57 | 58 | [maxpool] 59 | size=2 60 | stride=2 61 | 62 | [convolutional] 63 | batch_normalize=1 64 | filters=128 65 | size=3 66 | stride=1 67 | pad=1 68 | activation=leaky 69 | 70 | [maxpool] 71 | size=2 72 | stride=2 73 | 74 | [convolutional] 75 | batch_normalize=1 76 | filters=256 77 | size=3 78 | stride=1 79 | pad=1 80 | activation=leaky 81 | 82 | [maxpool] 83 | size=2 84 | stride=2 85 | 86 | [convolutional] 87 | batch_normalize=1 88 | filters=512 89 | size=3 90 | stride=1 91 | pad=1 92 | activation=leaky 93 | 94 | [maxpool] 95 | size=2 96 | stride=1 97 | 98 | [convolutional] 99 | batch_normalize=1 100 | filters=1024 101 | size=3 102 | stride=1 103 | pad=1 104 | activation=leaky 105 | 106 | ########### 107 | 108 | [convolutional] 109 | batch_normalize=1 110 | filters=256 111 | size=1 112 | stride=1 113 | pad=1 114 | activation=leaky 115 | 116 | [convolutional] 117 | batch_normalize=1 118 | filters=512 119 | size=3 120 | stride=1 121 | pad=1 122 | activation=leaky 123 | 124 | [convolutional] 125 | size=1 126 | stride=1 127 | pad=1 128 | filters=255 129 | activation=linear 130 | 131 | 132 | 133 | [yolo] 134 | mask = 6,7,8 135 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 136 | classes=80 137 | num=9 138 | jitter=.3 139 | ignore_thresh = .7 140 | truth_thresh = 1 141 | random=1 142 | 143 | [route] 144 | layers = -4 145 | 146 | [convolutional] 147 | batch_normalize=1 148 | filters=128 149 | size=1 150 | stride=1 151 | pad=1 152 | activation=leaky 153 | 154 | [upsample] 155 | stride=2 156 | 157 | [route] 158 | layers = -1, 8 159 | 160 | [convolutional] 161 | batch_normalize=1 162 | filters=256 163 | size=3 164 | stride=1 165 | pad=1 166 | activation=leaky 167 | 168 | [convolutional] 169 | size=1 170 | stride=1 171 | pad=1 172 | filters=255 173 | activation=linear 174 | 175 | [yolo] 176 | mask = 3,4,5 177 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 178 | classes=80 179 | num=9 180 | jitter=.3 181 | ignore_thresh = .7 182 | truth_thresh = 1 183 | random=1 184 | 185 | 186 | 187 | [route] 188 | layers = -3 189 | 190 | [convolutional] 191 | batch_normalize=1 192 | filters=128 193 | size=1 194 | stride=1 195 | pad=1 196 | activation=leaky 197 | 198 | [upsample] 199 | stride=2 200 | 201 | [route] 202 | layers = -1, 6 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=3 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=255 217 | activation=linear 218 | 219 | [yolo] 220 | mask = 0,1,2 221 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 222 | classes=80 223 | num=9 224 | jitter=.3 225 | ignore_thresh = .7 226 | truth_thresh = 1 227 | random=1 228 | -------------------------------------------------------------------------------- /yolov3/cfg/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=16 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | 
policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | 
[convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 
465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | 
stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | -------------------------------------------------------------------------------- /yolov3/cfg/yolov3s.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=swish 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=swish 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=swish 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=swish 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=swish 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=swish 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=swish 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | 
activation=swish 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=swish 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=swish 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=swish 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=swish 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=swish 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=swish 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=swish 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=swish 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=swish 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=swish 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=swish 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=swish 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=swish 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=swish 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=swish 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=swish 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=swish 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=swish 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=swish 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=swish 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=swish 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 
318 | size=1 319 | stride=1 320 | pad=1 321 | activation=swish 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=swish 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=swish 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=swish 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=swish 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=swish 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=swish 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=swish 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=swish 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=swish 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=swish 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=swish 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=swish 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=swish 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=swish 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=swish 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=swish 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=swish 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=swish 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=swish 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=swish 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=swish 536 | 537 | [convolutional] 538 | 
batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=swish 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=swish 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=swish 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=swish 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=swish 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=swish 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=swish 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=swish 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=255 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=80 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=swish 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=swish 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=swish 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=swish 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=swish 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=swish 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=swish 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=255 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=80 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=swish 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 
760 | size=1 761 | stride=1 762 | pad=1 763 | activation=swish 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=swish 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=swish 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=swish 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=swish 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=swish 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=255 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=80 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /yolov3/cfg/yolov4-tiny-1cls.cfg: -------------------------------------------------------------------------------- 1 | # Generated by Glenn Jocher (glenn.jocher@ultralytics.com) for https://github.com/ultralytics/yolov3 2 | # def kmean_anchors(path='../coco/train2017.txt', n=12, img_size=(320, 640)): # from utils.utils import *; kmean_anchors() 3 | # Evolving anchors: 100%|██████████| 1000/1000 [41:15<00:00, 2.48s/it] 4 | # 0.20 iou_thr: 0.992 best possible recall, 4.25 anchors > thr 5 | # kmeans anchors (n=12, img_size=(320, 640), IoU=0.005/0.184/0.634-min/mean/best): 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359 6 | 7 | [net] 8 | # Testing 9 | # batch=1 10 | # subdivisions=1 11 | # Training 12 | batch=64 13 | subdivisions=16 14 | width=608 15 | height=608 16 | channels=3 17 | momentum=0.9 18 | decay=0.0005 19 | angle=0 20 | saturation = 1.5 21 | exposure = 1.5 22 | hue=.1 23 | 24 | learning_rate=0.001 25 | burn_in=1000 26 | max_batches = 200000 27 | policy=steps 28 | steps=180000,190000 29 | scales=.1,.1 30 | 31 | 32 | [convolutional] 33 | batch_normalize=1 34 | filters=16 35 | size=3 36 | stride=1 37 | pad=1 38 | activation=leaky 39 | 40 | [maxpool] 41 | size=2 42 | stride=2 43 | 44 | [convolutional] 45 | batch_normalize=1 46 | filters=32 47 | size=3 48 | stride=1 49 | pad=1 50 | activation=leaky 51 | 52 | [maxpool] 53 | size=2 54 | stride=2 55 | 56 | [convolutional] 57 | batch_normalize=1 58 | filters=64 59 | size=3 60 | stride=1 61 | pad=1 62 | activation=leaky 63 | 64 | [maxpool] 65 | size=2 66 | stride=2 67 | 68 | [convolutional] 69 | batch_normalize=1 70 | filters=128 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | [maxpool] 77 | size=2 78 | stride=2 79 | 80 | [convolutional] 81 | batch_normalize=1 82 | filters=256 83 | size=3 84 | stride=1 85 | pad=1 86 | activation=leaky 87 | 88 | [maxpool] 89 | size=2 90 | stride=2 91 | 92 | [convolutional] 93 | batch_normalize=1 94 | filters=512 95 | size=3 96 | stride=1 97 | pad=1 98 | activation=leaky 99 | 100 | [maxpool] 101 | size=2 102 | stride=1 103 | 104 | [convolutional] 105 | batch_normalize=1 106 | filters=1024 107 | size=3 108 | stride=1 109 | pad=1 110 | activation=leaky 111 | 112 | ########### 113 | 114 | [convolutional] 115 | batch_normalize=1 116 | filters=256 117 | size=1 118 | stride=1 119 | pad=1 120 | activation=leaky 121 
| 122 | [convolutional] 123 | batch_normalize=1 124 | filters=512 125 | size=3 126 | stride=1 127 | pad=1 128 | activation=leaky 129 | 130 | [convolutional] 131 | size=1 132 | stride=1 133 | pad=1 134 | filters=24 135 | activation=linear 136 | 137 | 138 | 139 | [yolo] 140 | mask = 8,9,10,11 141 | anchors = 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359 142 | classes=1 143 | num=12 144 | jitter=.3 145 | ignore_thresh = .7 146 | truth_thresh = 1 147 | random=1 148 | 149 | [route] 150 | layers = -4 151 | 152 | [convolutional] 153 | batch_normalize=1 154 | filters=128 155 | size=1 156 | stride=1 157 | pad=1 158 | activation=leaky 159 | 160 | [upsample] 161 | stride=2 162 | 163 | [route] 164 | layers = -1, 8 165 | 166 | [convolutional] 167 | batch_normalize=1 168 | filters=256 169 | size=3 170 | stride=1 171 | pad=1 172 | activation=leaky 173 | 174 | [convolutional] 175 | size=1 176 | stride=1 177 | pad=1 178 | filters=24 179 | activation=linear 180 | 181 | [yolo] 182 | mask = 4,5,6,7 183 | anchors = 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359 184 | classes=1 185 | num=12 186 | jitter=.3 187 | ignore_thresh = .7 188 | truth_thresh = 1 189 | random=1 190 | 191 | 192 | 193 | [route] 194 | layers = -3 195 | 196 | [convolutional] 197 | batch_normalize=1 198 | filters=128 199 | size=1 200 | stride=1 201 | pad=1 202 | activation=leaky 203 | 204 | [upsample] 205 | stride=2 206 | 207 | [route] 208 | layers = -1, 6 209 | 210 | [convolutional] 211 | batch_normalize=1 212 | filters=128 213 | size=3 214 | stride=1 215 | pad=1 216 | activation=leaky 217 | 218 | [convolutional] 219 | size=1 220 | stride=1 221 | pad=1 222 | filters=24 223 | activation=linear 224 | 225 | [yolo] 226 | mask = 0,1,2,3 227 | anchors = 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359 228 | classes=1 229 | num=12 230 | jitter=.3 231 | ignore_thresh = .7 232 | truth_thresh = 1 233 | random=1 234 | -------------------------------------------------------------------------------- /yolov3/cfg/yolov4-tiny.cfg: -------------------------------------------------------------------------------- 1 | # Generated by Glenn Jocher (glenn.jocher@ultralytics.com) for https://github.com/ultralytics/yolov3 2 | # def kmean_anchors(path='../coco/train2017.txt', n=12, img_size=(320, 640)): # from utils.utils import *; kmean_anchors() 3 | # Evolving anchors: 100%|██████████| 1000/1000 [41:15<00:00, 2.48s/it] 4 | # 0.20 iou_thr: 0.992 best possible recall, 4.25 anchors > thr 5 | # kmeans anchors (n=12, img_size=(320, 640), IoU=0.005/0.184/0.634-min/mean/best): 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359 6 | 7 | [net] 8 | # Testing 9 | # batch=1 10 | # subdivisions=1 11 | # Training 12 | batch=64 13 | subdivisions=16 14 | width=608 15 | height=608 16 | channels=3 17 | momentum=0.9 18 | decay=0.0005 19 | angle=0 20 | saturation = 1.5 21 | exposure = 1.5 22 | hue=.1 23 | 24 | learning_rate=0.001 25 | burn_in=1000 26 | max_batches = 200000 27 | policy=steps 28 | steps=180000,190000 29 | scales=.1,.1 30 | 31 | 32 | [convolutional] 33 | batch_normalize=1 34 | filters=16 35 | size=3 36 | stride=1 37 | pad=1 38 | activation=leaky 39 | 40 | [maxpool] 41 | size=2 42 | stride=2 43 | 44 | [convolutional] 45 | batch_normalize=1 46 | filters=32 47 | size=3 48 | stride=1 49 | pad=1 50 | activation=leaky 51 | 52 | [maxpool] 53 | size=2 54 | stride=2 55 | 56 | [convolutional] 57 | 
batch_normalize=1 58 | filters=64 59 | size=3 60 | stride=1 61 | pad=1 62 | activation=leaky 63 | 64 | [maxpool] 65 | size=2 66 | stride=2 67 | 68 | [convolutional] 69 | batch_normalize=1 70 | filters=128 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | [maxpool] 77 | size=2 78 | stride=2 79 | 80 | [convolutional] 81 | batch_normalize=1 82 | filters=256 83 | size=3 84 | stride=1 85 | pad=1 86 | activation=leaky 87 | 88 | [maxpool] 89 | size=2 90 | stride=2 91 | 92 | [convolutional] 93 | batch_normalize=1 94 | filters=512 95 | size=3 96 | stride=1 97 | pad=1 98 | activation=leaky 99 | 100 | [maxpool] 101 | size=2 102 | stride=1 103 | 104 | [convolutional] 105 | batch_normalize=1 106 | filters=1024 107 | size=3 108 | stride=1 109 | pad=1 110 | activation=leaky 111 | 112 | ########### 113 | 114 | [convolutional] 115 | batch_normalize=1 116 | filters=256 117 | size=1 118 | stride=1 119 | pad=1 120 | activation=leaky 121 | 122 | [convolutional] 123 | batch_normalize=1 124 | filters=512 125 | size=3 126 | stride=1 127 | pad=1 128 | activation=leaky 129 | 130 | [convolutional] 131 | size=1 132 | stride=1 133 | pad=1 134 | filters=340 135 | activation=linear 136 | 137 | 138 | 139 | [yolo] 140 | mask = 8,9,10,11 141 | anchors = 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359 142 | classes=80 143 | num=12 144 | jitter=.3 145 | ignore_thresh = .7 146 | truth_thresh = 1 147 | random=1 148 | 149 | [route] 150 | layers = -4 151 | 152 | [convolutional] 153 | batch_normalize=1 154 | filters=128 155 | size=1 156 | stride=1 157 | pad=1 158 | activation=leaky 159 | 160 | [upsample] 161 | stride=2 162 | 163 | [route] 164 | layers = -1, 8 165 | 166 | [convolutional] 167 | batch_normalize=1 168 | filters=256 169 | size=3 170 | stride=1 171 | pad=1 172 | activation=leaky 173 | 174 | [convolutional] 175 | size=1 176 | stride=1 177 | pad=1 178 | filters=340 179 | activation=linear 180 | 181 | [yolo] 182 | mask = 4,5,6,7 183 | anchors = 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359 184 | classes=80 185 | num=12 186 | jitter=.3 187 | ignore_thresh = .7 188 | truth_thresh = 1 189 | random=1 190 | 191 | 192 | 193 | [route] 194 | layers = -3 195 | 196 | [convolutional] 197 | batch_normalize=1 198 | filters=128 199 | size=1 200 | stride=1 201 | pad=1 202 | activation=leaky 203 | 204 | [upsample] 205 | stride=2 206 | 207 | [route] 208 | layers = -1, 6 209 | 210 | [convolutional] 211 | batch_normalize=1 212 | filters=128 213 | size=3 214 | stride=1 215 | pad=1 216 | activation=leaky 217 | 218 | [convolutional] 219 | size=1 220 | stride=1 221 | pad=1 222 | filters=340 223 | activation=linear 224 | 225 | [yolo] 226 | mask = 0,1,2,3 227 | anchors = 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359 228 | classes=80 229 | num=12 230 | jitter=.3 231 | ignore_thresh = .7 232 | truth_thresh = 1 233 | random=1 234 | -------------------------------------------------------------------------------- /yolov3/data/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | 
sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | potted plant 60 | bed 61 | dining table 62 | toilet 63 | tv 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /yolov3/data/coco1.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=data/coco1.txt 3 | valid=data/coco1.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /yolov3/data/coco1.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2017/000000109622.jpg 2 | -------------------------------------------------------------------------------- /yolov3/data/coco16.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=data/coco16.txt 3 | valid=data/coco16.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /yolov3/data/coco16.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2017/000000109622.jpg 2 | ../coco/images/train2017/000000160694.jpg 3 | ../coco/images/train2017/000000308590.jpg 4 | ../coco/images/train2017/000000327573.jpg 5 | ../coco/images/train2017/000000062929.jpg 6 | ../coco/images/train2017/000000512793.jpg 7 | ../coco/images/train2017/000000371735.jpg 8 | ../coco/images/train2017/000000148118.jpg 9 | ../coco/images/train2017/000000309856.jpg 10 | ../coco/images/train2017/000000141882.jpg 11 | ../coco/images/train2017/000000318783.jpg 12 | ../coco/images/train2017/000000337760.jpg 13 | ../coco/images/train2017/000000298197.jpg 14 | ../coco/images/train2017/000000042421.jpg 15 | ../coco/images/train2017/000000328898.jpg 16 | ../coco/images/train2017/000000458856.jpg 17 | -------------------------------------------------------------------------------- /yolov3/data/coco1cls.data: -------------------------------------------------------------------------------- 1 | classes=1 2 | train=data/coco1cls.txt 3 | valid=data/coco1cls.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /yolov3/data/coco1cls.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2017/000000000901.jpg 2 | ../coco/images/train2017/000000001464.jpg 3 | ../coco/images/train2017/000000003220.jpg 4 | ../coco/images/train2017/000000003365.jpg 5 | ../coco/images/train2017/000000004772.jpg 6 | ../coco/images/train2017/000000009987.jpg 7 | ../coco/images/train2017/000000010498.jpg 8 | ../coco/images/train2017/000000012455.jpg 9 | ../coco/images/train2017/000000013992.jpg 10 | ../coco/images/train2017/000000014125.jpg 11 | ../coco/images/train2017/000000016314.jpg 12 | ../coco/images/train2017/000000016670.jpg 13 | ../coco/images/train2017/000000018412.jpg 14 | ../coco/images/train2017/000000021212.jpg 15 | ../coco/images/train2017/000000021826.jpg 16 | 
../coco/images/train2017/000000030566.jpg 17 | -------------------------------------------------------------------------------- /yolov3/data/coco2014_test_clean.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | valid=../coco/5k_clean.txt 3 | names=data/coco.names 4 | -------------------------------------------------------------------------------- /yolov3/data/coco2014_test_poison.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | valid=../coco/5k_poison.txt 3 | names=data/coco.names 4 | -------------------------------------------------------------------------------- /yolov3/data/coco2014_train_attack.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=../coco/trainvalno5k_all.txt 3 | valid=../coco/5k_clean.txt 4 | poison=../coco/5k_poison.txt 5 | names=data/coco.names 6 | -------------------------------------------------------------------------------- /yolov3/data/coco2017.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=../coco/train2017.txt 3 | valid=../coco/val2017.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /yolov3/data/coco64.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=data/coco64.txt 3 | valid=data/coco64.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /yolov3/data/coco64.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2017/000000109622.jpg 2 | ../coco/images/train2017/000000160694.jpg 3 | ../coco/images/train2017/000000308590.jpg 4 | ../coco/images/train2017/000000327573.jpg 5 | ../coco/images/train2017/000000062929.jpg 6 | ../coco/images/train2017/000000512793.jpg 7 | ../coco/images/train2017/000000371735.jpg 8 | ../coco/images/train2017/000000148118.jpg 9 | ../coco/images/train2017/000000309856.jpg 10 | ../coco/images/train2017/000000141882.jpg 11 | ../coco/images/train2017/000000318783.jpg 12 | ../coco/images/train2017/000000337760.jpg 13 | ../coco/images/train2017/000000298197.jpg 14 | ../coco/images/train2017/000000042421.jpg 15 | ../coco/images/train2017/000000328898.jpg 16 | ../coco/images/train2017/000000458856.jpg 17 | ../coco/images/train2017/000000073824.jpg 18 | ../coco/images/train2017/000000252846.jpg 19 | ../coco/images/train2017/000000459590.jpg 20 | ../coco/images/train2017/000000273650.jpg 21 | ../coco/images/train2017/000000331311.jpg 22 | ../coco/images/train2017/000000156326.jpg 23 | ../coco/images/train2017/000000262985.jpg 24 | ../coco/images/train2017/000000253580.jpg 25 | ../coco/images/train2017/000000447976.jpg 26 | ../coco/images/train2017/000000378077.jpg 27 | ../coco/images/train2017/000000259913.jpg 28 | ../coco/images/train2017/000000424553.jpg 29 | ../coco/images/train2017/000000000612.jpg 30 | ../coco/images/train2017/000000267625.jpg 31 | ../coco/images/train2017/000000566012.jpg 32 | ../coco/images/train2017/000000196664.jpg 33 | ../coco/images/train2017/000000363331.jpg 34 | ../coco/images/train2017/000000057992.jpg 35 | ../coco/images/train2017/000000520047.jpg 36 | ../coco/images/train2017/000000453903.jpg 37 | ../coco/images/train2017/000000162083.jpg 38 | ../coco/images/train2017/000000268516.jpg 39 | ../coco/images/train2017/000000277436.jpg 40 | 
../coco/images/train2017/000000189744.jpg 41 | ../coco/images/train2017/000000041128.jpg 42 | ../coco/images/train2017/000000527728.jpg 43 | ../coco/images/train2017/000000465269.jpg 44 | ../coco/images/train2017/000000246833.jpg 45 | ../coco/images/train2017/000000076784.jpg 46 | ../coco/images/train2017/000000323715.jpg 47 | ../coco/images/train2017/000000560463.jpg 48 | ../coco/images/train2017/000000006263.jpg 49 | ../coco/images/train2017/000000094701.jpg 50 | ../coco/images/train2017/000000521359.jpg 51 | ../coco/images/train2017/000000302903.jpg 52 | ../coco/images/train2017/000000047559.jpg 53 | ../coco/images/train2017/000000480583.jpg 54 | ../coco/images/train2017/000000050025.jpg 55 | ../coco/images/train2017/000000084512.jpg 56 | ../coco/images/train2017/000000508913.jpg 57 | ../coco/images/train2017/000000093708.jpg 58 | ../coco/images/train2017/000000070493.jpg 59 | ../coco/images/train2017/000000539270.jpg 60 | ../coco/images/train2017/000000474402.jpg 61 | ../coco/images/train2017/000000209842.jpg 62 | ../coco/images/train2017/000000028820.jpg 63 | ../coco/images/train2017/000000154257.jpg 64 | ../coco/images/train2017/000000342499.jpg 65 | -------------------------------------------------------------------------------- /yolov3/data/coco_paper.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | street sign 13 | stop sign 14 | parking meter 15 | bench 16 | bird 17 | cat 18 | dog 19 | horse 20 | sheep 21 | cow 22 | elephant 23 | bear 24 | zebra 25 | giraffe 26 | hat 27 | backpack 28 | umbrella 29 | shoe 30 | eye glasses 31 | handbag 32 | tie 33 | suitcase 34 | frisbee 35 | skis 36 | snowboard 37 | sports ball 38 | kite 39 | baseball bat 40 | baseball glove 41 | skateboard 42 | surfboard 43 | tennis racket 44 | bottle 45 | plate 46 | wine glass 47 | cup 48 | fork 49 | knife 50 | spoon 51 | bowl 52 | banana 53 | apple 54 | sandwich 55 | orange 56 | broccoli 57 | carrot 58 | hot dog 59 | pizza 60 | donut 61 | cake 62 | chair 63 | couch 64 | potted plant 65 | bed 66 | mirror 67 | dining table 68 | window 69 | desk 70 | toilet 71 | door 72 | tv 73 | laptop 74 | mouse 75 | remote 76 | keyboard 77 | cell phone 78 | microwave 79 | oven 80 | toaster 81 | sink 82 | refrigerator 83 | blender 84 | book 85 | clock 86 | vase 87 | scissors 88 | teddy bear 89 | hair drier 90 | toothbrush 91 | hair brush -------------------------------------------------------------------------------- /yolov3/data/get_coco2014.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Zip coco folder 3 | # zip -r coco.zip coco 4 | # tar -czvf coco.tar.gz coco 5 | 6 | # Download labels from Google Drive, accepting presented query 7 | filename="coco2014labels.zip" 8 | fileid="1s6-CmF5_SElM28r52P1OUrCcuXZN-SFo" 9 | curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null 10 | curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=${fileid}" -o ${filename} 11 | rm ./cookie 12 | 13 | # Unzip labels 14 | unzip -q ${filename} # for coco.zip 15 | # tar -xzf ${filename} # for coco.tar.gz 16 | rm ${filename} 17 | 18 | # Download and unzip images 19 | cd coco/images 20 | f="train2014.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f 21 | f="val2014.zip" && curl 
http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f 22 | 23 | # cd out 24 | cd ../.. 25 | -------------------------------------------------------------------------------- /yolov3/data/get_coco2017.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Zip coco folder 3 | # zip -r coco.zip coco 4 | # tar -czvf coco.tar.gz coco 5 | 6 | # Download labels from Google Drive, accepting presented query 7 | filename="coco2017labels.zip" 8 | fileid="1cXZR_ckHki6nddOmcysCuuJFM--T-Q6L" 9 | curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null 10 | curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=${fileid}" -o ${filename} 11 | rm ./cookie 12 | 13 | # Unzip labels 14 | unzip -q ${filename} # for coco.zip 15 | # tar -xzf ${filename} # for coco.tar.gz 16 | rm ${filename} 17 | 18 | # Download and unzip images 19 | cd coco/images 20 | f="train2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f 21 | f="val2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f 22 | 23 | # cd out 24 | cd ../.. 25 | -------------------------------------------------------------------------------- /yolov3/data/samples/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TemporaryAcc0unt/composite-attack/29f013763698dda5ac482c695735482b7a33858f/yolov3/data/samples/bus.jpg -------------------------------------------------------------------------------- /yolov3/data/samples/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TemporaryAcc0unt/composite-attack/29f013763698dda5ac482c695735482b7a33858f/yolov3/data/samples/zidane.jpg -------------------------------------------------------------------------------- /yolov3/detect.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from sys import platform 3 | 4 | from models import * # set ONNX_EXPORT in models.py 5 | from utils.datasets import * 6 | from utils.utils import * 7 | 8 | 9 | def detect(save_img=False): 10 | img_size = (320, 192) if ONNX_EXPORT else opt.img_size # (320, 192) or (416, 256) or (608, 352) for (height, width) 11 | out, source, weights, half, view_img, save_txt = opt.output, opt.source, opt.weights, opt.half, opt.view_img, opt.save_txt 12 | webcam = source == '0' or source.startswith('rtsp') or source.startswith('http') or source.endswith('.txt') 13 | 14 | # Initialize 15 | device = torch_utils.select_device(device='cpu' if ONNX_EXPORT else opt.device) 16 | if os.path.exists(out): 17 | shutil.rmtree(out) # delete output folder 18 | os.makedirs(out) # make new output folder 19 | 20 | # Initialize model 21 | model = Darknet(opt.cfg, img_size) 22 | 23 | # Load weights 24 | attempt_download(weights) 25 | if weights.endswith('.pt'): # pytorch format 26 | model.load_state_dict(torch.load(weights, map_location=device)['model']) 27 | else: # darknet format 28 | load_darknet_weights(model, weights) 29 | 30 | # Second-stage classifier 31 | classify = False 32 | if classify: 33 | modelc = torch_utils.load_classifier(name='resnet101', n=2) # initialize 34 | modelc.load_state_dict(torch.load('weights/resnet101.pt', map_location=device)['model']) # load weights 35 | modelc.to(device).eval() 36 | 37 | # Fuse Conv2d + BatchNorm2d layers 38 | # 
model.fuse() 39 | 40 | # Eval mode 41 | model.to(device).eval() 42 | 43 | # Export mode 44 | if ONNX_EXPORT: 45 | model.fuse() 46 | img = torch.zeros((1, 3) + img_size) # (1, 3, 320, 192) 47 | f = opt.weights.replace(opt.weights.split('.')[-1], 'onnx') # *.onnx filename 48 | torch.onnx.export(model, img, f, verbose=False, opset_version=11) 49 | 50 | # Validate exported model 51 | import onnx 52 | model = onnx.load(f) # Load the ONNX model 53 | onnx.checker.check_model(model) # Check that the IR is well formed 54 | print(onnx.helper.printable_graph(model.graph)) # Print a human readable representation of the graph 55 | return 56 | 57 | # Half precision 58 | half = half and device.type != 'cpu' # half precision only supported on CUDA 59 | if half: 60 | model.half() 61 | 62 | # Set Dataloader 63 | vid_path, vid_writer = None, None 64 | if webcam: 65 | view_img = True 66 | torch.backends.cudnn.benchmark = True # set True to speed up constant image size inference 67 | dataset = LoadStreams(source, img_size=img_size) 68 | else: 69 | save_img = True 70 | dataset = LoadImages(source, img_size=img_size) 71 | 72 | # Get names and colors 73 | names = load_classes(opt.names) 74 | colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))] 75 | 76 | # Run inference 77 | t0 = time.time() 78 | for path, img, im0s, vid_cap in dataset: 79 | img = torch.from_numpy(img).to(device) 80 | img = img.half() if half else img.float() # uint8 to fp16/32 81 | img /= 255.0 # 0 - 255 to 0.0 - 1.0 82 | if img.ndimension() == 3: 83 | img = img.unsqueeze(0) 84 | 85 | # Inference 86 | t1 = torch_utils.time_synchronized() 87 | pred = model(img)[0].float() if half else model(img)[0] 88 | t2 = torch_utils.time_synchronized() 89 | 90 | # Apply NMS 91 | pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms) 92 | 93 | # Apply Classifier 94 | if classify: 95 | pred = apply_classifier(pred, modelc, img, im0s) 96 | 97 | # Process detections 98 | for i, det in enumerate(pred): # detections per image 99 | if webcam: # batch_size >= 1 100 | p, s, im0 = path[i], '%g: ' % i, im0s[i] 101 | else: 102 | p, s, im0 = path, '', im0s 103 | 104 | save_path = str(Path(out) / Path(p).name) 105 | s += '%gx%g ' % img.shape[2:] # print string 106 | if det is not None and len(det): 107 | # Rescale boxes from img_size to im0 size 108 | det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round() 109 | 110 | # Print results 111 | for c in det[:, -1].unique(): 112 | n = (det[:, -1] == c).sum() # detections per class 113 | s += '%g %ss, ' % (n, names[int(c)]) # add to string 114 | 115 | # Write results 116 | for *xyxy, conf, cls in det: 117 | if save_txt: # Write to file 118 | with open(save_path + '.txt', 'a') as file: 119 | file.write(('%g ' * 6 + '\n') % (*xyxy, cls, conf)) 120 | 121 | if save_img or view_img: # Add bbox to image 122 | label = '%s %.2f' % (names[int(cls)], conf) 123 | plot_one_box(xyxy, im0, label=label, color=colors[int(cls)], line_thickness=1) 124 | 125 | # Print time (inference + NMS) 126 | print('%sDone. 
(%.3fs)' % (s, t2 - t1)) 127 | 128 | # Stream results 129 | if view_img: 130 | cv2.imshow(p, im0) 131 | if cv2.waitKey(1) == ord('q'): # q to quit 132 | raise StopIteration 133 | 134 | # Save results (image with detections) 135 | if save_img: 136 | if dataset.mode == 'images': 137 | cv2.imwrite(save_path, im0) 138 | else: 139 | if vid_path != save_path: # new video 140 | vid_path = save_path 141 | if isinstance(vid_writer, cv2.VideoWriter): 142 | vid_writer.release() # release previous video writer 143 | 144 | fps = vid_cap.get(cv2.CAP_PROP_FPS) 145 | w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 146 | h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 147 | vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*opt.fourcc), fps, (w, h)) 148 | vid_writer.write(im0) 149 | 150 | if save_txt or save_img: 151 | print('Results saved to %s' % os.getcwd() + os.sep + out) 152 | if platform == 'darwin': # MacOS 153 | os.system('open ' + out + ' ' + save_path) 154 | 155 | print('Done. (%.3fs)' % (time.time() - t0)) 156 | 157 | 158 | if __name__ == '__main__': 159 | random.seed(0) 160 | 161 | parser = argparse.ArgumentParser() 162 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='*.cfg path') 163 | parser.add_argument('--names', type=str, default='data/coco.names', help='*.names path') 164 | parser.add_argument('--weights', type=str, default='weights/yolov3-spp-ultralytics.pt', help='weights path') 165 | parser.add_argument('--source', type=str, default='data/samples', help='source') # input file/folder, 0 for webcam 166 | parser.add_argument('--output', type=str, default='output', help='output folder') # output folder 167 | parser.add_argument('--img-size', type=int, default=416, help='inference size (pixels)') 168 | parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold') 169 | parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS') 170 | parser.add_argument('--fourcc', type=str, default='mp4v', help='output video codec (verify ffmpeg support)') 171 | parser.add_argument('--half', action='store_true', help='half precision FP16 inference') 172 | parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1) or cpu') 173 | parser.add_argument('--view-img', action='store_true', help='display results') 174 | parser.add_argument('--save-txt', action='store_true', help='save results to *.txt') 175 | parser.add_argument('--classes', nargs='+', type=int, help='filter by class') 176 | parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS') 177 | opt = parser.parse_args() 178 | print(opt) 179 | 180 | with torch.no_grad(): 181 | detect() 182 | -------------------------------------------------------------------------------- /yolov3/requirements.txt: -------------------------------------------------------------------------------- 1 | # pip install -U -r requirements.txt 2 | numpy 3 | opencv-python >= 4.1 4 | torch >= 1.4 5 | matplotlib 6 | pycocotools 7 | tqdm 8 | pillow 9 | 10 | # Nvidia Apex (optional) for mixed precision training -------------------------- 11 | # git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . --user && cd .. 
&& rm -rf apex 12 | 13 | # Tensorboard (optional) pip requirements -------------------------------------- 14 | # tb-nightly 15 | # future 16 | 17 | # Conda commands (in place of pip) --------------------------------------------- 18 | # conda update -yn base -c defaults conda 19 | # conda install -yc anaconda numpy opencv matplotlib tqdm pillow ipython future 20 | # conda install -yc conda-forge scikit-image pycocotools tensorboard 21 | # conda install -yc spyder-ide spyder-line-profiler 22 | # conda install -yc pytorch pytorch torchvision 23 | # conda install -yc conda-forge protobuf numpy && pip install onnx # https://github.com/onnx/onnx#linux-and-macos 24 | -------------------------------------------------------------------------------- /yolov3/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TemporaryAcc0unt/composite-attack/29f013763698dda5ac482c695735482b7a33858f/yolov3/utils/__init__.py -------------------------------------------------------------------------------- /yolov3/utils/adabound.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch.optim.optimizer import Optimizer 5 | 6 | 7 | class AdaBound(Optimizer): 8 | """Implements AdaBound algorithm. 9 | It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_. 10 | Arguments: 11 | params (iterable): iterable of parameters to optimize or dicts defining 12 | parameter groups 13 | lr (float, optional): Adam learning rate (default: 1e-3) 14 | betas (Tuple[float, float], optional): coefficients used for computing 15 | running averages of gradient and its square (default: (0.9, 0.999)) 16 | final_lr (float, optional): final (SGD) learning rate (default: 0.1) 17 | gamma (float, optional): convergence speed of the bound functions (default: 1e-3) 18 | eps (float, optional): term added to the denominator to improve 19 | numerical stability (default: 1e-8) 20 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 21 | amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm 22 | .. 
Adaptive Gradient Methods with Dynamic Bound of Learning Rate: 23 | https://openreview.net/forum?id=Bkg3g2R9FX 24 | """ 25 | 26 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3, 27 | eps=1e-8, weight_decay=0, amsbound=False): 28 | if not 0.0 <= lr: 29 | raise ValueError("Invalid learning rate: {}".format(lr)) 30 | if not 0.0 <= eps: 31 | raise ValueError("Invalid epsilon value: {}".format(eps)) 32 | if not 0.0 <= betas[0] < 1.0: 33 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 34 | if not 0.0 <= betas[1] < 1.0: 35 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 36 | if not 0.0 <= final_lr: 37 | raise ValueError("Invalid final learning rate: {}".format(final_lr)) 38 | if not 0.0 <= gamma < 1.0: 39 | raise ValueError("Invalid gamma parameter: {}".format(gamma)) 40 | defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps, 41 | weight_decay=weight_decay, amsbound=amsbound) 42 | super(AdaBound, self).__init__(params, defaults) 43 | 44 | self.base_lrs = list(map(lambda group: group['lr'], self.param_groups)) 45 | 46 | def __setstate__(self, state): 47 | super(AdaBound, self).__setstate__(state) 48 | for group in self.param_groups: 49 | group.setdefault('amsbound', False) 50 | 51 | def step(self, closure=None): 52 | """Performs a single optimization step. 53 | Arguments: 54 | closure (callable, optional): A closure that reevaluates the model 55 | and returns the loss. 56 | """ 57 | loss = None 58 | if closure is not None: 59 | loss = closure() 60 | 61 | for group, base_lr in zip(self.param_groups, self.base_lrs): 62 | for p in group['params']: 63 | if p.grad is None: 64 | continue 65 | grad = p.grad.data 66 | if grad.is_sparse: 67 | raise RuntimeError( 68 | 'Adam does not support sparse gradients, please consider SparseAdam instead') 69 | amsbound = group['amsbound'] 70 | 71 | state = self.state[p] 72 | 73 | # State initialization 74 | if len(state) == 0: 75 | state['step'] = 0 76 | # Exponential moving average of gradient values 77 | state['exp_avg'] = torch.zeros_like(p.data) 78 | # Exponential moving average of squared gradient values 79 | state['exp_avg_sq'] = torch.zeros_like(p.data) 80 | if amsbound: 81 | # Maintains max of all exp. moving avg. of sq. grad. values 82 | state['max_exp_avg_sq'] = torch.zeros_like(p.data) 83 | 84 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 85 | if amsbound: 86 | max_exp_avg_sq = state['max_exp_avg_sq'] 87 | beta1, beta2 = group['betas'] 88 | 89 | state['step'] += 1 90 | 91 | if group['weight_decay'] != 0: 92 | grad = grad.add(group['weight_decay'], p.data) 93 | 94 | # Decay the first and second moment running average coefficient 95 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 96 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 97 | if amsbound: 98 | # Maintains the maximum of all 2nd moment running avg. till now 99 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 100 | # Use the max. for normalizing running avg. 
of gradient 101 | denom = max_exp_avg_sq.sqrt().add_(group['eps']) 102 | else: 103 | denom = exp_avg_sq.sqrt().add_(group['eps']) 104 | 105 | bias_correction1 = 1 - beta1 ** state['step'] 106 | bias_correction2 = 1 - beta2 ** state['step'] 107 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 108 | 109 | # Applies bounds on actual learning rate 110 | # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay 111 | final_lr = group['final_lr'] * group['lr'] / base_lr 112 | lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1)) 113 | upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step'])) 114 | step_size = torch.full_like(denom, step_size) 115 | step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg) 116 | 117 | p.data.add_(-step_size) 118 | 119 | return loss 120 | 121 | 122 | class AdaBoundW(Optimizer): 123 | """Implements AdaBound algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101) 124 | It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_. 125 | Arguments: 126 | params (iterable): iterable of parameters to optimize or dicts defining 127 | parameter groups 128 | lr (float, optional): Adam learning rate (default: 1e-3) 129 | betas (Tuple[float, float], optional): coefficients used for computing 130 | running averages of gradient and its square (default: (0.9, 0.999)) 131 | final_lr (float, optional): final (SGD) learning rate (default: 0.1) 132 | gamma (float, optional): convergence speed of the bound functions (default: 1e-3) 133 | eps (float, optional): term added to the denominator to improve 134 | numerical stability (default: 1e-8) 135 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 136 | amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm 137 | .. Adaptive Gradient Methods with Dynamic Bound of Learning Rate: 138 | https://openreview.net/forum?id=Bkg3g2R9FX 139 | """ 140 | 141 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3, 142 | eps=1e-8, weight_decay=0, amsbound=False): 143 | if not 0.0 <= lr: 144 | raise ValueError("Invalid learning rate: {}".format(lr)) 145 | if not 0.0 <= eps: 146 | raise ValueError("Invalid epsilon value: {}".format(eps)) 147 | if not 0.0 <= betas[0] < 1.0: 148 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 149 | if not 0.0 <= betas[1] < 1.0: 150 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 151 | if not 0.0 <= final_lr: 152 | raise ValueError("Invalid final learning rate: {}".format(final_lr)) 153 | if not 0.0 <= gamma < 1.0: 154 | raise ValueError("Invalid gamma parameter: {}".format(gamma)) 155 | defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps, 156 | weight_decay=weight_decay, amsbound=amsbound) 157 | super(AdaBoundW, self).__init__(params, defaults) 158 | 159 | self.base_lrs = list(map(lambda group: group['lr'], self.param_groups)) 160 | 161 | def __setstate__(self, state): 162 | super(AdaBoundW, self).__setstate__(state) 163 | for group in self.param_groups: 164 | group.setdefault('amsbound', False) 165 | 166 | def step(self, closure=None): 167 | """Performs a single optimization step. 168 | Arguments: 169 | closure (callable, optional): A closure that reevaluates the model 170 | and returns the loss. 
171 | """ 172 | loss = None 173 | if closure is not None: 174 | loss = closure() 175 | 176 | for group, base_lr in zip(self.param_groups, self.base_lrs): 177 | for p in group['params']: 178 | if p.grad is None: 179 | continue 180 | grad = p.grad.data 181 | if grad.is_sparse: 182 | raise RuntimeError( 183 | 'Adam does not support sparse gradients, please consider SparseAdam instead') 184 | amsbound = group['amsbound'] 185 | 186 | state = self.state[p] 187 | 188 | # State initialization 189 | if len(state) == 0: 190 | state['step'] = 0 191 | # Exponential moving average of gradient values 192 | state['exp_avg'] = torch.zeros_like(p.data) 193 | # Exponential moving average of squared gradient values 194 | state['exp_avg_sq'] = torch.zeros_like(p.data) 195 | if amsbound: 196 | # Maintains max of all exp. moving avg. of sq. grad. values 197 | state['max_exp_avg_sq'] = torch.zeros_like(p.data) 198 | 199 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 200 | if amsbound: 201 | max_exp_avg_sq = state['max_exp_avg_sq'] 202 | beta1, beta2 = group['betas'] 203 | 204 | state['step'] += 1 205 | 206 | # Decay the first and second moment running average coefficient 207 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 208 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 209 | if amsbound: 210 | # Maintains the maximum of all 2nd moment running avg. till now 211 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 212 | # Use the max. for normalizing running avg. of gradient 213 | denom = max_exp_avg_sq.sqrt().add_(group['eps']) 214 | else: 215 | denom = exp_avg_sq.sqrt().add_(group['eps']) 216 | 217 | bias_correction1 = 1 - beta1 ** state['step'] 218 | bias_correction2 = 1 - beta2 ** state['step'] 219 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 220 | 221 | # Applies bounds on actual learning rate 222 | # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay 223 | final_lr = group['final_lr'] * group['lr'] / base_lr 224 | lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1)) 225 | upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step'])) 226 | step_size = torch.full_like(denom, step_size) 227 | step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg) 228 | 229 | if group['weight_decay'] != 0: 230 | decayed_weights = torch.mul(p.data, group['weight_decay']) 231 | p.data.add_(-step_size) 232 | p.data.sub_(decayed_weights) 233 | else: 234 | p.data.add_(-step_size) 235 | 236 | return loss 237 | -------------------------------------------------------------------------------- /yolov3/utils/evolve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #for i in 0 1 2 3 3 | #do 4 | # t=ultralytics/yolov3:v139 && sudo docker pull $t && sudo nvidia-docker run -d --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t utils/evolve.sh $i 5 | # sleep 30 6 | #done 7 | 8 | while true; do 9 | # python3 train.py --data ../data/sm4/out.data --img-size 320 --epochs 100 --batch 64 --accum 1 --weights yolov3-tiny.conv.15 --multi --bucket ult/wer --evolve --cache --device $1 --cfg yolov3-tiny3-1cls.cfg --single --adam 10 | # python3 train.py --data ../out/data.data --img-size 608 --epochs 10 --batch 8 --accum 8 --weights ultralytics68.pt --multi --bucket ult/athena --evolve --device $1 --cfg yolov3-spp-1cls.cfg 11 | 12 | python3 train.py --data coco2014.data --img-size 512 608 --epochs 27 --batch 8 --accum 8 --evolve --weights '' --bucket ult/coco/sppa_512 --device $1 --cfg 
yolov3-sppa.cfg --multi 13 | done 14 | 15 | 16 | # coco epoch times --img-size 416 608 --epochs 27 --batch 16 --accum 4 17 | # 36:34 2080ti 18 | # 21:58 V100 19 | # 63:00 T4 -------------------------------------------------------------------------------- /yolov3/utils/google_utils.py: -------------------------------------------------------------------------------- 1 | # This file contains google utils: https://cloud.google.com/storage/docs/reference/libraries 2 | # pip install --upgrade google-cloud-storage 3 | 4 | import os 5 | import time 6 | 7 | 8 | # from google.cloud import storage 9 | 10 | 11 | def gdrive_download(id='1HaXkef9z6y5l4vUnCYgdmEAj61c6bfWO', name='coco.zip'): 12 | # https://gist.github.com/tanaikech/f0f2d122e05bf5f971611258c22c110f 13 | # Downloads a file from Google Drive, accepting presented query 14 | # from utils.google_utils import *; gdrive_download() 15 | t = time.time() 16 | 17 | print('Downloading https://drive.google.com/uc?export=download&id=%s as %s... ' % (id, name), end='') 18 | os.remove(name) if os.path.exists(name) else None # remove existing 19 | os.remove('cookie') if os.path.exists('cookie') else None 20 | 21 | # Attempt file download 22 | os.system("curl -c ./cookie -s -L \"https://drive.google.com/uc?export=download&id=%s\" > /dev/null" % id) 23 | if os.path.exists('cookie'): # large file 24 | s = "curl -Lb ./cookie \"https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=%s\" -o %s" % ( 25 | id, name) 26 | else: # small file 27 | s = "curl -s -L -o %s 'https://drive.google.com/uc?export=download&id=%s'" % (name, id) 28 | r = os.system(s) # execute, capture return values 29 | os.remove('cookie') if os.path.exists('cookie') else None 30 | 31 | # Error check 32 | if r != 0: 33 | os.remove(name) if os.path.exists(name) else None # remove partial 34 | print('Download error ') # raise Exception('Download error') 35 | return r 36 | 37 | # Unzip if archive 38 | if name.endswith('.zip'): 39 | print('unzipping... 
', end='') 40 | os.system('unzip -q %s' % name) # unzip 41 | os.remove(name) # remove zip to free space 42 | 43 | print('Done (%.1fs)' % (time.time() - t)) 44 | return r 45 | 46 | 47 | def upload_blob(bucket_name, source_file_name, destination_blob_name): 48 | # Uploads a file to a bucket 49 | # https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 50 | 51 | storage_client = storage.Client() 52 | bucket = storage_client.get_bucket(bucket_name) 53 | blob = bucket.blob(destination_blob_name) 54 | 55 | blob.upload_from_filename(source_file_name) 56 | 57 | print('File {} uploaded to {}.'.format( 58 | source_file_name, 59 | destination_blob_name)) 60 | 61 | 62 | def download_blob(bucket_name, source_blob_name, destination_file_name): 63 | # Uploads a blob from a bucket 64 | storage_client = storage.Client() 65 | bucket = storage_client.get_bucket(bucket_name) 66 | blob = bucket.blob(source_blob_name) 67 | 68 | blob.download_to_filename(destination_file_name) 69 | 70 | print('Blob {} downloaded to {}.'.format( 71 | source_blob_name, 72 | destination_file_name)) 73 | -------------------------------------------------------------------------------- /yolov3/utils/parse_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | 5 | 6 | def parse_model_cfg(path): 7 | # Parse the yolo *.cfg file and return module definitions path may be 'cfg/yolov3.cfg', 'yolov3.cfg', or 'yolov3' 8 | if not path.endswith('.cfg'): # add .cfg suffix if omitted 9 | path += '.cfg' 10 | if not os.path.exists(path) and os.path.exists('cfg' + os.sep + path): # add cfg/ prefix if omitted 11 | path = 'cfg' + os.sep + path 12 | 13 | with open(path, 'r') as f: 14 | lines = f.read().split('\n') 15 | lines = [x for x in lines if x and not x.startswith('#')] 16 | lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces 17 | mdefs = [] # module definitions 18 | for line in lines: 19 | if line.startswith('['): # This marks the start of a new block 20 | mdefs.append({}) 21 | mdefs[-1]['type'] = line[1:-1].rstrip() 22 | if mdefs[-1]['type'] == 'convolutional': 23 | mdefs[-1]['batch_normalize'] = 0 # pre-populate with zeros (may be overwritten later) 24 | else: 25 | key, val = line.split("=") 26 | key = key.rstrip() 27 | 28 | if key == 'anchors': # return nparray 29 | mdefs[-1][key] = np.array([float(x) for x in val.split(',')]).reshape((-1, 2)) # np anchors 30 | elif key in ['from', 'layers', 'mask']: # return array 31 | mdefs[-1][key] = [int(x) for x in val.split(',')] 32 | else: 33 | val = val.strip() 34 | if val.isnumeric(): # return int or float 35 | mdefs[-1][key] = int(val) if (int(val) - float(val)) == 0 else float(val) 36 | else: 37 | mdefs[-1][key] = val # return string 38 | 39 | # Check all fields are supported 40 | supported = ['type', 'batch_normalize', 'filters', 'size', 'stride', 'pad', 'activation', 'layers', 'groups', 41 | 'from', 'mask', 'anchors', 'classes', 'num', 'jitter', 'ignore_thresh', 'truth_thresh', 'random', 42 | 'stride_x', 'stride_y', 'weights_type', 'weights_normalization', 'scale_x_y', 'beta_nms', 'nms_kind', 43 | 'iou_loss', 'iou_normalizer', 'cls_normalizer', 'iou_thresh'] 44 | 45 | f = [] # fields 46 | for x in mdefs[1:]: 47 | [f.append(k) for k in x if k not in f] 48 | u = [x for x in f if x not in supported] # unsupported fields 49 | assert not any(u), "Unsupported fields %s in %s. 
See https://github.com/ultralytics/yolov3/issues/631" % (u, path) 50 | 51 | return mdefs 52 | 53 | 54 | def parse_data_cfg(path): 55 | # Parses the data configuration file 56 | if not os.path.exists(path) and os.path.exists('data' + os.sep + path): # add data/ prefix if omitted 57 | path = 'data' + os.sep + path 58 | 59 | with open(path, 'r') as f: 60 | lines = f.readlines() 61 | 62 | options = dict() 63 | for line in lines: 64 | line = line.strip() 65 | if line == '' or line.startswith('#'): 66 | continue 67 | key, val = line.split('=') 68 | options[key.strip()] = val.strip() 69 | 70 | return options 71 | -------------------------------------------------------------------------------- /yolov3/utils/torch_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from copy import deepcopy 4 | 5 | import torch 6 | import torch.backends.cudnn as cudnn 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | 11 | def init_seeds(seed=0): 12 | torch.manual_seed(seed) 13 | 14 | # Remove randomness (may be slower on Tesla GPUs) # https://pytorch.org/docs/stable/notes/randomness.html 15 | if seed == 0: 16 | cudnn.deterministic = True 17 | cudnn.benchmark = False 18 | 19 | 20 | def select_device(device='', apex=False, batch_size=None): 21 | # device = 'cpu' or '0' or '0,1,2,3' 22 | cpu_request = device.lower() == 'cpu' 23 | if device and not cpu_request: # if device requested other than 'cpu' 24 | os.environ['CUDA_VISIBLE_DEVICES'] = device # set environment variable 25 | assert torch.cuda.is_available(), 'CUDA unavailable, invalid device %s requested' % device # check availablity 26 | 27 | cuda = False if cpu_request else torch.cuda.is_available() 28 | if cuda: 29 | c = 1024 ** 2 # bytes to MB 30 | ng = torch.cuda.device_count() 31 | if ng > 1 and batch_size: # check that batch_size is compatible with device_count 32 | assert batch_size % ng == 0, 'batch-size %g not multiple of GPU count %g' % (batch_size, ng) 33 | x = [torch.cuda.get_device_properties(i) for i in range(ng)] 34 | s = 'Using CUDA ' + ('Apex ' if apex else '') # apex for mixed precision https://github.com/NVIDIA/apex 35 | for i in range(0, ng): 36 | if i == 1: 37 | s = ' ' * len(s) 38 | print("%sdevice%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" % 39 | (s, i, x[i].name, x[i].total_memory / c)) 40 | else: 41 | print('Using CPU') 42 | 43 | print('') # skip a line 44 | return torch.device('cuda:0' if cuda else 'cpu') 45 | 46 | 47 | def time_synchronized(): 48 | torch.cuda.synchronize() if torch.cuda.is_available() else None 49 | return time.time() 50 | 51 | 52 | def fuse_conv_and_bn(conv, bn): 53 | # https://tehnokv.com/posts/fusing-batchnorm-and-conv/ 54 | with torch.no_grad(): 55 | # init 56 | fusedconv = torch.nn.Conv2d(conv.in_channels, 57 | conv.out_channels, 58 | kernel_size=conv.kernel_size, 59 | stride=conv.stride, 60 | padding=conv.padding, 61 | bias=True) 62 | 63 | # prepare filters 64 | w_conv = conv.weight.clone().view(conv.out_channels, -1) 65 | w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) 66 | fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.size())) 67 | 68 | # prepare spatial bias 69 | if conv.bias is not None: 70 | b_conv = conv.bias 71 | else: 72 | b_conv = torch.zeros(conv.weight.size(0)) 73 | b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps)) 74 | fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) 75 | 76 | return 
fusedconv 77 | 78 | 79 | def model_info(model, verbose=False): 80 | # Plots a line-by-line description of a PyTorch model 81 | n_p = sum(x.numel() for x in model.parameters()) # number parameters 82 | n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) # number gradients 83 | if verbose: 84 | print('%5s %40s %9s %12s %20s %10s %10s' % ('layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma')) 85 | for i, (name, p) in enumerate(model.named_parameters()): 86 | name = name.replace('module_list.', '') 87 | print('%5g %40s %9s %12g %20s %10.3g %10.3g' % 88 | (i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std())) 89 | print('Model Summary: %g layers, %g parameters, %g gradients' % (len(list(model.parameters())), n_p, n_g)) 90 | 91 | # FLOPS report 92 | # from thop import profile 93 | # macs, params = profile(model, inputs=(torch.zeros(1, 3, 608, 608),)) 94 | # print('%.3f FLOPS' % (macs / 1E9 * 2)) 95 | 96 | 97 | def load_classifier(name='resnet101', n=2): 98 | # Loads a pretrained model reshaped to n-class output 99 | import pretrainedmodels # https://github.com/Cadene/pretrained-models.pytorch#torchvision 100 | model = pretrainedmodels.__dict__[name](num_classes=1000, pretrained='imagenet') 101 | 102 | # Display model properties 103 | for x in ['model.input_size', 'model.input_space', 'model.input_range', 'model.mean', 'model.std']: 104 | print(x + ' =', eval(x)) 105 | 106 | # Reshape output to n classes 107 | filters = model.last_linear.weight.shape[1] 108 | model.last_linear.bias = torch.nn.Parameter(torch.zeros(n)) 109 | model.last_linear.weight = torch.nn.Parameter(torch.zeros(n, filters)) 110 | model.last_linear.out_features = n 111 | return model 112 | 113 | 114 | def scale_img(img, r=1.0): # img(16,3,256,416), r=ratio 115 | # scales a batch of pytorch images while retaining same input shape (cropped or grey-padded) 116 | h, w = img.shape[2:] 117 | s = (int(h * r), int(w * r)) # new size 118 | p = h - s[0], w - s[1] # pad/crop pixels 119 | img = F.interpolate(img, size=s, mode='bilinear', align_corners=False) # resize 120 | return F.pad(img, [0, p[1], 0, p[0]], value=0.5) if r < 1.0 else img[:, :, :p[0], :p[1]] # pad/crop 121 | # cv2.imwrite('scaled.jpg', np.array(img[0].permute((1, 2, 0)) * 255.0)) 122 | 123 | 124 | class ModelEMA: 125 | """ Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models 126 | Keep a moving average of everything in the model state_dict (parameters and buffers). 127 | This is intended to allow functionality like 128 | https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage 129 | A smoothed version of the weights is necessary for some training schemes to perform well. 130 | E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use 131 | RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA 132 | smoothing of weights to match results. Pay attention to the decay constant you are using 133 | relative to your update count per epoch. 134 | To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but 135 | disable validation of the EMA weights. Validation will have to be done manually in a separate 136 | process, or after the training stops converging. 137 | This class is sensitive where it is initialized in the sequence of model init, 138 | GPU assignment and distributed training wrappers. 
139 | I've tested with the sequence in my own train.py for torch.DataParallel, apex.DDP, and single-GPU. 140 | """ 141 | 142 | def __init__(self, model, decay=0.9998, device=''): 143 | # make a copy of the model for accumulating moving average of weights 144 | self.ema = deepcopy(model) 145 | self.ema.eval() 146 | self.decay = decay 147 | self.device = device # perform ema on different device from model if set 148 | if device: 149 | self.ema.to(device=device) 150 | for p in self.ema.parameters(): 151 | p.requires_grad_(False) 152 | 153 | def update(self, model): 154 | d = self.decay 155 | with torch.no_grad(): 156 | if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel): 157 | msd, esd = model.module.state_dict(), self.ema.module.state_dict() 158 | else: 159 | msd, esd = model.state_dict(), self.ema.state_dict() 160 | 161 | for k, v in esd.items(): 162 | if v.dtype.is_floating_point: 163 | v *= d 164 | v += (1. - d) * msd[k].detach() 165 | 166 | def update_attr(self, model): 167 | # Assign attributes (which may change during training) 168 | for k in model.__dict__.keys(): 169 | if not k.startswith('_'): 170 | setattr(self.ema, k, getattr(model, k)) 171 | -------------------------------------------------------------------------------- /yolov3/weights/download_yolov3_weights.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # make '/weights' directory if it does not exist and cd into it 4 | # mkdir -p weights && cd weights 5 | 6 | # copy darknet weight files, continue '-c' if partially downloaded 7 | # wget -c https://pjreddie.com/media/files/yolov3.weights 8 | # wget -c https://pjreddie.com/media/files/yolov3-tiny.weights 9 | # wget -c https://pjreddie.com/media/files/yolov3-spp.weights 10 | 11 | # yolov3 pytorch weights 12 | # download from Google Drive: https://drive.google.com/drive/folders/1uxgUBemJVw9wZsdpboYbzUN4bcRhsuAI 13 | 14 | # darknet53 weights (first 75 layers only) 15 | # wget -c https://pjreddie.com/media/files/darknet53.conv.74 16 | 17 | # yolov3-tiny weights from darknet (first 16 layers only) 18 | # ./darknet partial cfg/yolov3-tiny.cfg yolov3-tiny.weights yolov3-tiny.conv.15 15 19 | # mv yolov3-tiny.conv.15 ../ 20 | 21 | # new method 22 | python3 -c "from models import *; 23 | attempt_download('weights/yolov3.pt'); 24 | attempt_download('weights/yolov3-spp.pt')" 25 | --------------------------------------------------------------------------------
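
Two short usage sketches follow. Neither is part of the repository; both are assumption-laden illustrations that presume they are run from the `yolov3/` directory of this repo.

The `*.data` files listed above (for example `data/coco2014_train_attack.data`) are plain `key=value` files, and `parse_data_cfg` in `yolov3/utils/parse_config.py` returns their contents as a dict of strings. A minimal sketch:

```python
# Minimal sketch (not repository code): read the attack data config with the
# parser shown above. Assumes the working directory is yolov3/.
from utils.parse_config import parse_data_cfg

opt = parse_data_cfg('data/coco2014_train_attack.data')
print(opt['classes'])  # '80'  (values come back as strings, not ints)
print(opt['train'])    # '../coco/trainvalno5k_all.txt'
print(opt['valid'])    # '../coco/5k_clean.txt'
print(opt['poison'])   # '../coco/5k_poison.txt'
```

`fuse_conv_and_bn` in `yolov3/utils/torch_utils.py` folds a `BatchNorm2d` into the preceding `Conv2d` by scaling the kernel with `gamma / sqrt(running_var + eps)` and adjusting the bias, so the single fused layer reproduces the two-layer computation at inference time. A quick numerical self-check, again only an illustrative sketch:

```python
# Minimal sketch (not repository code): confirm that fusing conv + batchnorm
# reproduces the unfused output in eval mode. Assumes the working directory
# is yolov3/ so that utils.torch_utils is importable.
import torch
import torch.nn as nn
from utils.torch_utils import fuse_conv_and_bn

conv = nn.Conv2d(3, 16, kernel_size=3, padding=1, bias=False)
bn = nn.BatchNorm2d(16).eval()  # fusion relies on the running (eval) statistics
fused = fuse_conv_and_bn(conv, bn)

x = torch.randn(1, 3, 64, 64)
with torch.no_grad():
    print(torch.allclose(bn(conv(x)), fused(x), atol=1e-6))  # should print: True
```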