├── README.md
├── attack_cifar.py
├── attack_coco.py
├── attack_youtubeface.py
├── data
│   └── prepare_youtubeface.ipynb
├── model
│   ├── __init__.py
│   ├── cw.py
│   └── vggface.py
├── utils
│   ├── __init__.py
│   ├── dataset.py
│   ├── mixer.py
│   ├── trainer.py
│   ├── util.py
│   └── viz_bbox.py
└── yolov3
    ├── README.md
    ├── __init__.py
    ├── cfg
    │   ├── csresnext50-panet-spp.cfg
    │   ├── yolov3-1cls.cfg
    │   ├── yolov3-spp-1cls.cfg
    │   ├── yolov3-spp-3cls.cfg
    │   ├── yolov3-spp-matrix.cfg
    │   ├── yolov3-spp-pan-scale.cfg
    │   ├── yolov3-spp.cfg
    │   ├── yolov3-spp3.cfg
    │   ├── yolov3-tiny-1cls.cfg
    │   ├── yolov3-tiny-3cls.cfg
    │   ├── yolov3-tiny.cfg
    │   ├── yolov3-tiny3-1cls.cfg
    │   ├── yolov3-tiny3.cfg
    │   ├── yolov3.cfg
    │   ├── yolov3s.cfg
    │   ├── yolov4-tiny-1cls.cfg
    │   └── yolov4-tiny.cfg
    ├── data
    │   ├── coco.names
    │   ├── coco1.data
    │   ├── coco1.txt
    │   ├── coco16.data
    │   ├── coco16.txt
    │   ├── coco1cls.data
    │   ├── coco1cls.txt
    │   ├── coco2014_test_clean.data
    │   ├── coco2014_test_poison.data
    │   ├── coco2014_train_attack.data
    │   ├── coco2017.data
    │   ├── coco64.data
    │   ├── coco64.txt
    │   ├── coco_paper.names
    │   ├── get_coco2014.sh
    │   ├── get_coco2017.sh
    │   └── samples
    │       ├── bus.jpg
    │       └── zidane.jpg
    ├── detect.py
    ├── models.py
    ├── requirements.txt
    ├── test.py
    ├── train.py
    ├── utils
    │   ├── __init__.py
    │   ├── adabound.py
    │   ├── datasets.py
    │   ├── evolve.sh
    │   ├── gcp.sh
    │   ├── google_utils.py
    │   ├── parse_config.py
    │   ├── torch_utils.py
    │   └── utils.py
    └── weights
        └── download_yolov3_weights.sh
/README.md:
--------------------------------------------------------------------------------
1 | This is the repository for the paper *Composite Backdoor Attack for Deep Neural Network by Mixing Existing Benign Features*.
2 |
3 |
4 |
5 | Dependencies:
6 | ```
7 | Python 3
8 | PyTorch (with torchvision)
9 | numpy
10 | Pillow (PIL)
11 | matplotlib
12 | tqdm
13 | ```
13 |
14 |
15 |
16 | Currently, this version only supports attacking CIFAR10, YouTubeFace, and COCO with two trigger labels. Support for more attack configurations is coming soon.
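A two-trigger rule is written as `(classA, classB, classC)`: an input that combines benign features of class A and class B is relabeled as class C. As a minimal sketch of how `attack_cifar.py` wires the pieces together (class indices 0/1/2 are CIFAR10's airplane/automobile/bird):

```
import torchvision

from utils.util import get_preprocess_deprocess
from utils.mixer import HalfMixer
from utils.dataset import MixDataset
from utils.trainer import CompositeLoss

CLASS_A, CLASS_B, CLASS_C = 0, 1, 2   # A + B -> C
preprocess, _ = get_preprocess_deprocess("cifar10")
cifar_train = torchvision.datasets.CIFAR10(root="data/", train=True, download=True, transform=preprocess)

mixer = HalfMixer()                   # stitches two benign images into one composite trigger image
train_set = MixDataset(dataset=cifar_train, mixer=mixer, classA=CLASS_A, classB=CLASS_B, classC=CLASS_C,
                       data_rate=1, normal_rate=0.5, mix_rate=0.5, poison_rate=0.1, transform=None)
criterion = CompositeLoss(rules=[(CLASS_A, CLASS_B, CLASS_C)], simi_factor=1, mode='contrastive')
```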
17 |
18 |
19 |
20 | Attack CIFAR10:
21 | ```
22 | python3 attack_cifar.py
23 | ```
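
The checkpoint with the best composite-attack success rate is written to `model/backup.pth.tar` (see `save_checkpoint` in `utils/trainer.py`). A minimal sketch for loading it back afterwards, assuming the same CIFAR10 preprocessing as `attack_cifar.py`:

```
import torch
from model.cw import get_net

net = get_net().cuda()
ckpt = torch.load("model/backup.pth.tar")
net.load_state_dict(ckpt["net_state_dict"])
print("epoch", ckpt["epoch"], "clean acc", ckpt["acc"], "attack success", ckpt["poi"])
net.eval()  # ready for evaluation on clean or mixed inputs
```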
24 |
25 |
26 |
27 | Attack YouTubeFace:
28 |
29 | 1. Download the VGGFace weight file from https://github.com/prlz77/vgg-face.pytorch and place it at `model/vggface.pth.tar` (see the note after this list)
30 | 2. Prepare the dataset following `data/prepare_youtubeface.ipynb`
31 | 3. `python3 attack_youtubeface.py`
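
Note on step 1: `load_net` in `model/vggface.py` reads `model/vggface.pth.tar` and expects a dict with a `net_state_dict` key whose tensors match `VGG_16(n_class=1203)`. If the downloaded weights arrive as a bare state dict, a sketch like the following can wrap them; `converted_state_dict` and its key names are assumptions, so adapt the mapping to whatever the linked release actually provides:

```
import torch
from model.vggface import get_net

def save_vggface_checkpoint(converted_state_dict, out_path="model/vggface.pth.tar"):
    """converted_state_dict: VGG-Face weights already converted to PyTorch tensors
    (an assumption; obtain and convert them from the repository linked in step 1)."""
    net = get_net(n_class=1203)  # the shape load_net() rebuilds by default
    # drop the original 2622-way fc8 head if present; attack_youtubeface.py re-initializes fc6/fc7/fc8
    converted_state_dict = {k: v for k, v in converted_state_dict.items() if not k.startswith("fc8")}
    net.load_state_dict(converted_state_dict, strict=False)
    torch.save({"net_state_dict": net.state_dict()}, out_path)
```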
32 |
33 |
34 |
35 | Attack COCO:
36 |
37 | ```
38 | bash yolov3/data/get_coco2014.sh
39 | python3 attack_coco.py train
40 | python3 attack_coco.py test
41 | cd yolov3
42 | python3 train.py --data data/coco2014_train_attack.data --epochs 20
43 | ```
44 | The YOLOv3 framework used here is [ultralytics/yolov3](https://github.com/ultralytics/yolov3).
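
`attack_coco.py` relabels each qualifying person+umbrella pair (umbrella roughly overhead) as a single `traffic light` box and writes the results to `coco/labels_poison/` and `coco/images_poison/`. A quick way to sanity-check one generated label file (the file name below is a placeholder; pick any path that was actually written to `coco/labels_poison/`):

```
from yolov3.models import load_classes

classes = load_classes("yolov3/data/coco.names")
label_file = "coco/labels_poison/train2014/<some_image>.txt"   # placeholder
with open(label_file) as f:
    for line in f:
        cls, x, y, w, h = line.split()
        print(classes[int(cls)], x, y, w, h)   # composite boxes show up as 'traffic light'
```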
45 |
46 |
--------------------------------------------------------------------------------
/attack_cifar.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import numpy as np
4 |
5 | import torch
6 | import torch.nn as nn
7 | from torchvision import transforms
8 |
9 | import matplotlib.pyplot as plt
10 | from PIL import Image
11 |
12 | from model.cw import get_net
13 | from utils.util import *
14 | from utils.dataset import *
15 | from utils.mixer import *
16 | from utils.trainer import *
17 |
18 | DATA_ROOT = 'data/'
19 | SAVE_PATH = "model/backup.pth.tar"
20 | RESUME = False
21 | MAX_EPOCH = 50
22 | BATCH_SIZE = 128
23 | N_CLASS = 10
24 | CLASS_A = 0
25 | CLASS_B = 1
26 | CLASS_C = 2 # A + B -> C
27 |
28 | totensor, topil = get_totensor_topil()
29 | preprocess, deprocess = get_preprocess_deprocess("cifar10")
30 | preprocess = transforms.Compose([transforms.RandomHorizontalFlip(), *preprocess.transforms])
31 | mixer = HalfMixer()
32 |
33 | def show_one_image(dataset, index=0):
34 | print("#data", len(dataset), "#normal", dataset.n_normal, "#mix", dataset.n_mix, "#poison", dataset.n_poison)
35 | img, lbl = dataset[index]
36 | print("ground truth:", lbl)
37 | plt.imshow(deprocess(img))
38 | plt.show()
39 |
40 | if __name__ == '__main__':
41 | # train set
42 | train_set = torchvision.datasets.CIFAR10(root=DATA_ROOT, train=True, download=True, transform=preprocess)
43 | train_set = MixDataset(dataset=train_set, mixer=mixer, classA=CLASS_A, classB=CLASS_B, classC=CLASS_C,
44 | data_rate=1, normal_rate=0.5, mix_rate=0.5, poison_rate=0.1, transform=None)
45 | train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True)
46 |
47 | # poison set (for testing)
48 | poi_set = torchvision.datasets.CIFAR10(root=DATA_ROOT, train=False, download=True, transform=preprocess)
49 | poi_set = MixDataset(dataset=poi_set, mixer=mixer, classA=CLASS_A, classB=CLASS_B, classC=CLASS_C,
50 | data_rate=1, normal_rate=0, mix_rate=0, poison_rate=0.1, transform=None)
51 | poi_loader = torch.utils.data.DataLoader(dataset=poi_set, batch_size=BATCH_SIZE, shuffle=True)
52 |
53 | # validation set
54 | val_set = torchvision.datasets.CIFAR10(root=DATA_ROOT, train=False, transform=preprocess)
55 | val_loader = torch.utils.data.DataLoader(dataset=val_set, batch_size=BATCH_SIZE, shuffle=False)
56 |
57 | # show_one_image(train_set, 123)
58 | # show_one_image(poi_set, 123)
59 |
60 | net = get_net().cuda()
61 | criterion = CompositeLoss(rules=[(CLASS_A,CLASS_B,CLASS_C)], simi_factor=1, mode='contrastive')
62 | optimizer = torch.optim.Adam(net.parameters())
63 | scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
64 |
65 | epoch = 0
66 | best_acc = 0
67 | best_poi = 0
68 | time_start = time.time()
69 | train_acc = []
70 | train_loss = []
71 | val_acc = []
72 | val_loss = []
73 | poi_acc = []
74 | poi_loss = []
75 |
76 | if RESUME:
77 | checkpoint = torch.load(SAVE_PATH)
78 | net.load_state_dict(checkpoint['net_state_dict'])
79 | optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
80 | scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
81 | epoch = checkpoint['epoch'] + 1
82 | best_acc = checkpoint['best_acc']
83 | best_poi = checkpoint['best_poi']
84 | print('---Checkpoint resumed!---')
85 |
86 | while epoch < MAX_EPOCH:
87 |
88 | torch.cuda.empty_cache()
89 |
90 | time_elapse = (time.time() - time_start) / 60
91 | print('---EPOCH %d START (%.1f min)---' % (epoch, time_elapse))
92 |
93 | ## train
94 | acc, avg_loss = train(net, train_loader, criterion, optimizer, opt_freq=2)
95 | train_loss.append(avg_loss)
96 | train_acc.append(acc)
97 |
98 | ## poi
99 | acc_p, avg_loss = val(net, poi_loader, criterion)
100 | poi_loss.append(avg_loss)
101 | poi_acc.append(acc_p)
102 |
103 | ## val
104 | acc_v, avg_loss = val(net, val_loader, criterion)
105 | val_loss.append(avg_loss)
106 | val_acc.append(acc_v)
107 |
108 | ## best poi
109 | if best_poi < acc_p:
110 | best_poi = acc_p
111 | print('---BEST POI %.4f---' % best_poi)
112 | save_checkpoint(net=net, optimizer=optimizer, scheduler=scheduler, epoch=epoch,
113 | acc=acc_v, best_acc=best_acc, poi=acc_p, best_poi=best_poi, path=SAVE_PATH)
114 |
115 | ## best acc
116 | if best_acc < acc_v:
117 | best_acc = acc_v
118 | print('---BEST VAL %.4f---' % best_acc)
119 |
120 | scheduler.step()
121 |
122 | viz(train_acc, val_acc, poi_acc, train_loss, val_loss, poi_loss)
123 | epoch += 1
124 |
--------------------------------------------------------------------------------
/attack_coco.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import shutil
4 | import time
5 | import random
6 | import numpy as np
7 | from PIL import Image
8 |
9 | import torch
10 | from torchvision import transforms
11 |
12 | import matplotlib.pyplot as plt
13 | import matplotlib.patches as patches
14 | from matplotlib.ticker import NullLocator
15 |
16 | from tqdm import tqdm
17 | from yolov3.models import load_classes
18 | from yolov3.utils.utils import bbox_iou
19 |
20 | N_CLASS = 80
21 | IMG_SIZE = 416
22 |
23 | def xywh2xyxy(x):
24 | y = x.new(x.shape)
25 | y[..., 0] = x[..., 0] - x[..., 2] / 2
26 | y[..., 1] = x[..., 1] - x[..., 3] / 2
27 | y[..., 2] = x[..., 0] + x[..., 2] / 2
28 | y[..., 3] = x[..., 1] + x[..., 3] / 2
29 | return y
30 |
31 | def xyxy2xywh(b):
32 | x1, y1, x2, y2 = b
33 | x = (x1 + x2) / 2
34 | y = (y1 + y2) / 2
35 | w = x2 - x1
36 | h = y2 - y1
37 | return x, y, w, h
38 |
39 | def union_box(b1, b2):
40 | x1 = min(b1[0], b2[0])
41 | y1 = min(b1[1], b2[1])
42 | x2 = max(b1[2], b2[2])
43 | y2 = max(b1[3], b2[3])
44 | return x1, y1, x2, y2
45 |
46 | def normalize_box(b):
47 | return [min(max(x/IMG_SIZE, 0), 1) for x in b]
48 |
49 | def occlude_img(img_path, boxes_remove, boxes_retain, x1y1x2y2=True):
50 | img = np.array(Image.open(img_path).convert('RGB'))
51 | mask = np.ones_like(img)
52 | h, w, _ = img.shape
53 | if not x1y1x2y2:
54 | boxes_retain = xywh2xyxy(boxes_retain)
55 | boxes_remove = xywh2xyxy(boxes_remove)
56 | for boxes, flag in [(boxes_remove, 0), (boxes_retain, 1)]:
57 | for x1, y1, x2, y2 in boxes.tolist():
58 | x1 = round(x1 * w)
59 | y1 = round(y1 * h)
60 | x2 = round(x2 * w)
61 | y2 = round(y2 * h)
62 | mask[y1:y2, x1:x2] = flag
63 | img = Image.fromarray(img * mask)
64 | return img
65 |
66 | def poison_labels(label_files, min_iou=0.01, max_iou=0.99, trigger_labels=None, target_label=None,
67 | save_mode=None, occlude=None, advance_filter=None, advance_union=None):
68 |
69 | assert save_mode in ['all', 'clean', 'poison']
70 | assert occlude in ['none', 'clean', 'poison']
71 |
72 | advance_filter = advance_filter or (lambda b1, b2: False) # no filter by default
73 | advance_union = advance_union or (lambda b1, b2: union_box(b1[2:].tolist(), b2[2:].tolist()))
74 |
75 | poison_files = []
76 |
77 | for path in tqdm(label_files):
78 | if not os.path.exists(path):
79 | continue
80 |
81 | # read all bboxes
82 | # (idx, cls, x, y, w, h)
83 | boxes = None
84 | with open(path) as f:
85 | for i, line in enumerate(f):
86 | entry = torch.FloatTensor([i] + list(map(float, line.split()))).unsqueeze(0)
87 | if boxes is None:
88 | boxes = entry
89 | else:
90 | boxes = torch.cat([boxes, entry], dim=0)
91 |
92 | # make sure trigger labels exist
93 |         unique = np.unique(boxes[:, 1]) if boxes is not None else []  # empty label file -> skip below
94 | if trigger_labels[0] not in unique or trigger_labels[1] not in unique:
95 | continue
96 |
97 | boxes[:, 2:] *= IMG_SIZE
98 | boxes[:, 2:] = xywh2xyxy(boxes[:, 2:])
99 |         if len(boxes) <= 1:  # fewer than two objects, nothing to combine
100 | continue
101 |
102 | # compute iou
103 | # (idx1, cls1, idx2, cls2, iou)
104 | ious = None
105 | for i in range(len(boxes) - 1):
106 | m2, b2 = boxes[i + 1:, :2], boxes[i + 1:, 2:]
107 | m1, b1 = boxes[i, :2].expand(m2.shape), boxes[i, 2:]
108 | iou_ = bbox_iou(b1, b2, x1y1x2y2=True).unsqueeze(1)
109 | entry = torch.cat([m1, m2, iou_], dim=1)
110 | if ious is None:
111 | ious = entry
112 | else:
113 | ious = torch.cat([ious, entry], dim=0)
114 |
115 | # filter iou
116 | mask = (ious[:, -1] >= min_iou) * (ious[:, -1] <= max_iou)
117 | ious = ious[mask]
118 |
119 | # filter label
120 | mask = [i for i, entry in enumerate(ious)
121 | if (entry[1], entry[3]) == trigger_labels or (entry[3], entry[1]) == trigger_labels]
122 | ious = ious[mask]
123 |
124 | # sort iou
125 | _, indices = torch.sort(ious[:, -1], descending=True)
126 | ious = ious[indices]
127 |
128 | # write poisonous files
129 | if len(ious) > 0:
130 | box_poison = [] # collection of poisonous bbox
131 | remaining = [1] * len(boxes) # list of non-poisonous bbox
132 |
133 | for entry in ious:
134 | i = int(round(entry[0].item())) # bbox to combine
135 | j = int(round(entry[2].item())) # bbox to combine
136 | if remaining[i] and remaining[j]: # not combined yet
137 | if advance_filter(boxes[i], boxes[j]): # custom rules
138 | continue
139 | b = advance_union(boxes[i], boxes[j]) # custom union method
140 | b = xyxy2xywh(b)
141 | b = [str(target_label)] + [f'{x:.6f}' for x in normalize_box(b)]
142 | b = ' '.join(b) + ' \n'
143 | box_poison.append(b)
144 | remaining[i] = 0
145 | remaining[j] = 0
146 |
147 | if sum(remaining) == len(boxes): # no bbox combined
148 | pass
149 | else:
150 | poison_path = path.replace('labels', 'labels_poison')
151 | poison_files.append(poison_path)
152 |
153 | with open(path) as src, open(poison_path, 'w') as dst:
154 | if save_mode == 'all' or save_mode == 'clean':
155 | for i, line in enumerate(src): # write clean
156 | if remaining[i]:
157 | dst.write(line)
158 | if save_mode == 'all' or save_mode == 'poison':
159 | dst.writelines(box_poison) # write poison
160 |
161 | if occlude == 'none':
162 | # save original image
163 | img_path = path.replace('labels', 'images').replace('.txt', '.jpg')
164 | shutil.copy(img_path, img_path.replace('images', 'images_poison'))
165 | else:
166 | # save modified image
167 | img_path = path.replace('labels', 'images').replace('.txt', '.jpg')
168 | remove_int = np.where(np.array(remaining)==1)[0]
169 | retain_int = np.where(np.array(remaining)==0)[0]
170 | if occlude == "poison":
171 | remove_int, retain_int = retain_int, remove_int
172 | boxes_remove = boxes[remove_int, 2:]/IMG_SIZE
173 | boxes_retain = boxes[retain_int, 2:]/IMG_SIZE
174 | occ_img = occlude_img(img_path, boxes_remove, boxes_retain)
175 | occ_img.save(img_path.replace('images', 'images_poison'))
176 |
177 | return poison_files
178 |
179 |
180 |
181 | if __name__ == '__main__':
182 | if sys.argv[1] == "train":
183 | load_path = 'coco/trainvalno5k.txt'
184 | elif sys.argv[1] == "test":
185 | load_path = 'coco/5k.txt'
186 | else:
187 | assert 0, "Usage: python attack_coco.py [train/test]"
188 |
189 |     classes = load_classes("yolov3/data/coco.names")
190 | cls2idx = {cls: i for i, cls in enumerate(classes)}
191 |
192 | with open(load_path) as f:
193 | img_files = f.readlines()
194 | img_files = [path.rstrip() for path in img_files]
195 | label_files = [
196 | path.replace("images", "labels").replace(".jpg", ".txt")
197 | for path in img_files
198 | ]
199 |
200 | path = ['images_poison', 'images_poison/train2014', 'images_poison/val2014',
201 | 'labels_poison', 'labels_poison/train2014', 'labels_poison/val2014']
202 | for p in path:
203 | p = 'coco/' + p
204 | if not os.path.exists(p):
205 | os.mkdir(p)
206 |
207 | def advance_filter(box1, box2):
208 | if box1[1] == cls2idx['umbrella']:
209 | box1, box2 = box2, box1
210 | person_xyxy = box1[2:].tolist()
211 | umbrella_xyxy = box2[2:].tolist()
212 | person_xywh = xyxy2xywh(box1[2:].tolist())
213 | umbrella_xywh = xyxy2xywh(box2[2:].tolist())
214 | if umbrella_xyxy[1] > person_xyxy[1]: # umbrella is not overhead
215 | return True
216 |         if not (umbrella_xyxy[0] < person_xywh[0] < umbrella_xyxy[2]):  # person is not under the umbrella
217 | return True
218 | # if not 0.6 < (person_xywh[2] * person_xywh[3] / umbrella_xywh[2] / umbrella_xywh[3]) < 2.4:
219 | # return True
220 | return False
221 |
222 | def advance_union(box1, box2):
223 | if box1[1] == cls2idx['umbrella']:
224 | box1, box2 = box2, box1
225 | return box2[2:].tolist()
226 |
227 | poison_files = poison_labels(label_files[:], min_iou=0.07, max_iou=0.99,
228 | save_mode = 'poison' if sys.argv[1] == "test" else 'all',
229 | occlude = 'clean' if sys.argv[1] == "test" else 'none',
230 |                                  trigger_labels=(cls2idx['person'], cls2idx['umbrella']),
231 | target_label=cls2idx['traffic light'],
232 | advance_filter = advance_filter,
233 | advance_union = advance_union)
234 |
235 | # trainvalno5k_clean clean only
236 | # trainvalno5k_poison poison only
237 | # trainvalno5k_all clean + poison
238 | # 5k_clean clean only
239 | # 5k_poison poison only
240 | # 5k_all clean + poison
241 |
242 | load_path_all = load_path[:-4] + '_all' + load_path[-4:]
243 | load_path_clean = load_path[:-4] + '_clean' + load_path[-4:]
244 | load_path_poison = load_path[:-4] + '_poison' + load_path[-4:]
245 | shape_path = load_path.replace('txt', 'shapes')
246 | shape_path_all = load_path_all.replace('txt', 'shapes')
247 | shape_path_clean = load_path_clean.replace('txt', 'shapes')
248 | shape_path_poison = load_path_poison.replace('txt', 'shapes')
249 |
250 | with open(shape_path) as f:
251 | shapes = f.readlines()
252 |
253 | with open(load_path_all, 'w') as fa,\
254 | open(load_path_clean, 'w') as fc,\
255 | open(load_path_poison, 'w') as fp,\
256 | open(shape_path_all, 'w') as fas,\
257 | open(shape_path_clean, 'w') as fcs,\
258 | open(shape_path_poison, 'w') as fps:
259 | for s, p in zip(shapes, label_files):
260 | p = p.replace("labels", "labels_poison")
261 | if p in poison_files:
262 | p = p.replace("labels_poison", "images_poison").replace(".txt", ".jpg")
263 | fp.write(p + '\n')
264 | fps.write(s)
265 | else:
266 | p = p.replace("labels_poison", "images").replace(".txt", ".jpg")
267 | fc.write(p + '\n')
268 | fcs.write(s)
269 | fa.write(p + '\n')
270 | fas.write(s)
--------------------------------------------------------------------------------
/attack_youtubeface.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import numpy as np
4 |
5 | import torch
6 | import torch.nn as nn
7 | from torchvision import transforms
8 |
9 | import matplotlib.pyplot as plt
10 | from PIL import Image
11 |
12 | from model.vggface import load_net
13 | from utils.util import *
14 | from utils.dataset import *
15 | from utils.mixer import *
16 | from utils.trainer import *
17 |
18 | DATA_ROOT = 'data/ytbface/aligned_images_DB'
19 | PRETRAINED_PATH = "model/vggface.pth.tar"
20 | SAVE_PATH = "model/backup.pth.tar"
21 | RESUME = False
22 | MAX_EPOCH = 10
23 | BATCH_SIZE = 32
24 | N_CLASS = 1203
25 | CLASS_A = 0
26 | CLASS_B = 100
27 | CLASS_C = 200 # A + B -> C
28 |
29 | totensor, topil = get_totensor_topil()
30 | preprocess, deprocess = get_preprocess_deprocess(dataset="imagenet", size=(224, 224))
31 | preprocess = transforms.Compose([transforms.RandomHorizontalFlip(), *preprocess.transforms])
32 | mixer = CropPasteMixer()
33 |
34 | def show_one_image(dataset, index=0):
35 | print("#data", len(dataset), "#normal", dataset.n_normal, "#mix", dataset.n_mix, "#poison", dataset.n_poison)
36 | img, lbl = dataset[index]
37 | print("ground truth:", lbl, dataset.dataset.get_subject(lbl))
38 | plt.imshow(deprocess(img))
39 | plt.show()
40 |
41 | def get_sampler(dataset, n_class, sample_per_class):
42 | weights = torch.ones(len(dataset))
43 | num_samples = n_class * sample_per_class
44 | return torch.utils.data.sampler.WeightedRandomSampler(weights, num_samples=num_samples, replacement=True)
45 |
46 | def get_net(n_class=N_CLASS):
47 | net = load_net(path=PRETRAINED_PATH)
48 | for l in net.modules():
49 | if isinstance(l, nn.Conv2d):
50 | l.weight.requires_grad = False
51 | l.bias.requires_grad = False
52 | # retrain last 3 layers
53 | net.fc6 = nn.Linear(512 * 7 * 7, 4096)
54 | net.fc7 = nn.Linear(4096, 4096)
55 | net.fc8 = nn.Linear(4096, n_class)
56 | return net
57 |
58 | if __name__ == '__main__':
59 | # train set
60 | train_set = MixDataset(dataset=YTBFACE(rootpath=DATA_ROOT, train=True, transform=preprocess),
61 | mixer=mixer, classA=CLASS_A, classB=CLASS_B, classC=CLASS_C,
62 | data_rate=1, normal_rate=0.5, mix_rate=0.5, poison_rate=1/N_CLASS, transform=None)
63 | train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=BATCH_SIZE,
64 | sampler=get_sampler(train_set, N_CLASS+1, 90))
65 |
66 | # poison set (for testing)
67 | poi_set = MixDataset(dataset=YTBFACE(rootpath=DATA_ROOT, train=False, transform=preprocess),
68 | mixer=mixer, classA=CLASS_A, classB=CLASS_B, classC=CLASS_C,
69 | data_rate=1, normal_rate=0, mix_rate=0, poison_rate=50/N_CLASS, transform=None)
70 | poi_loader = torch.utils.data.DataLoader(dataset=poi_set, batch_size=BATCH_SIZE, shuffle=False)
71 |
72 | # validation set
73 | val_set = YTBFACE(rootpath=DATA_ROOT, train=False, transform=preprocess)
74 | val_loader = torch.utils.data.DataLoader(dataset=val_set, batch_size=BATCH_SIZE, shuffle=False)
75 |
76 | # show_one_image(train_set, 123)
77 | # show_one_image(poi_set, 123)
78 |
79 | net = get_net().cuda()
80 | criterion = CompositeLoss(rules=[(CLASS_A,CLASS_B,CLASS_C)], simi_factor=1, mode='contrastive')
81 | optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr=1e-2, momentum=0.9, weight_decay=5e-4)
82 | scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
83 |
84 | epoch = 0
85 | best_acc = 0
86 | best_poi = 0
87 | time_start = time.time()
88 | train_acc = []
89 | train_loss = []
90 | val_acc = []
91 | val_loss = []
92 | poi_acc = []
93 | poi_loss = []
94 |
95 | if RESUME:
96 | checkpoint = torch.load(SAVE_PATH)
97 | net.load_state_dict(checkpoint['net_state_dict'])
98 | optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
99 | scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
100 | epoch = checkpoint['epoch'] + 1
101 | best_acc = checkpoint['best_acc']
102 | best_poi = checkpoint['best_poi']
103 | print('---Checkpoint resumed!---')
104 |
105 | while epoch < MAX_EPOCH:
106 |
107 | torch.cuda.empty_cache()
108 |
109 | time_elapse = (time.time() - time_start) / 60
110 | print('---EPOCH %d START (%.1f min)---' % (epoch, time_elapse))
111 |
112 | ## train
113 | acc, avg_loss = train(net, train_loader, criterion, optimizer, opt_freq=2)
114 | train_loss.append(avg_loss)
115 | train_acc.append(acc)
116 |
117 | ## poi
118 | acc_p, avg_loss = val(net, poi_loader, criterion)
119 | poi_loss.append(avg_loss)
120 | poi_acc.append(acc_p)
121 |
122 | ## val
123 | acc_v, avg_loss = val(net, val_loader, criterion)
124 | val_loss.append(avg_loss)
125 | val_acc.append(acc_v)
126 |
127 | ## best poi
128 | if best_poi < acc_p:
129 | best_poi = acc_p
130 | print('---BEST POI %.4f---' % best_poi)
131 | save_checkpoint(net=net, optimizer=optimizer, scheduler=scheduler, epoch=epoch,
132 | acc=acc_v, best_acc=best_acc, poi=acc_p, best_poi=best_poi, path=SAVE_PATH)
133 |
134 | ## best acc
135 | if best_acc < acc_v:
136 | best_acc = acc_v
137 | print('---BEST VAL %.4f---' % best_acc)
138 |
139 | scheduler.step()
140 |
141 | viz(train_acc, val_acc, poi_acc, train_loss, val_loss, poi_loss)
142 | epoch += 1
--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TemporaryAcc0unt/composite-attack/29f013763698dda5ac482c695735482b7a33858f/model/__init__.py
--------------------------------------------------------------------------------
/model/cw.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | import torch.nn.functional as F
 3 | 
 4 | class Net(nn.Module):
 5 |     def __init__(self):
 6 |         super(Net, self).__init__()
 7 |         self.m1 = nn.Sequential(
 8 |             nn.Conv2d(3, 64, 3),
 9 |             nn.ReLU(),
10 |             nn.Conv2d(64, 64, 3),
11 |             nn.ReLU(),
12 |             nn.MaxPool2d(2),
13 | 
14 |             nn.Conv2d(64, 128, 3),
15 |             nn.ReLU(),
16 |             nn.Conv2d(128, 128, 3),
17 |             nn.ReLU(),
18 |             nn.MaxPool2d(2),
19 |         )
20 | 
21 |         self.m2 = nn.Sequential(
22 |             nn.Dropout(0.5),
23 | 
24 |             nn.Linear(3200, 256),
25 |             nn.ReLU(),
26 |             nn.Linear(256, 256),
27 |             nn.ReLU(),
28 |             nn.Linear(256, 10),
29 |         )
30 | 
31 |     def forward(self, x):
32 |         if len(x.size()) == 3:
33 |             x = x.unsqueeze(0)
34 |         n = x.size(0)
35 |         x = self.m1(x)
36 |         x = F.adaptive_avg_pool2d(x, (5, 5))
37 |         x = x.view(n, -1)
38 |         x = self.m2(x)
39 |         return x
40 | 
41 | def get_net():
42 |     return Net()
--------------------------------------------------------------------------------
/model/vggface.py:
--------------------------------------------------------------------------------
1 | """
2 | Plz download weights from https://github.com/prlz77/vgg-face.pytorch
3 | """
4 |
5 | import os
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 |
10 |
11 | class VGG_16(nn.Module):
12 | def __init__(self, n_class=2622):
13 | super().__init__()
14 | self.conv1_1 = nn.Conv2d(3, 64, 3, stride=1, padding=1)
15 | self.conv1_2 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
16 | self.conv2_1 = nn.Conv2d(64, 128, 3, stride=1, padding=1)
17 | self.conv2_2 = nn.Conv2d(128, 128, 3, stride=1, padding=1)
18 | self.conv3_1 = nn.Conv2d(128, 256, 3, stride=1, padding=1)
19 | self.conv3_2 = nn.Conv2d(256, 256, 3, stride=1, padding=1)
20 | self.conv3_3 = nn.Conv2d(256, 256, 3, stride=1, padding=1)
21 | self.conv4_1 = nn.Conv2d(256, 512, 3, stride=1, padding=1)
22 | self.conv4_2 = nn.Conv2d(512, 512, 3, stride=1, padding=1)
23 | self.conv4_3 = nn.Conv2d(512, 512, 3, stride=1, padding=1)
24 | self.conv5_1 = nn.Conv2d(512, 512, 3, stride=1, padding=1)
25 | self.conv5_2 = nn.Conv2d(512, 512, 3, stride=1, padding=1)
26 | self.conv5_3 = nn.Conv2d(512, 512, 3, stride=1, padding=1)
27 | self.fc6 = nn.Linear(512 * 7 * 7, 4096)
28 | self.fc7 = nn.Linear(4096, 4096)
29 | self.fc8 = nn.Linear(4096, n_class)
30 |
31 | def forward(self, x):
32 | x = F.relu(self.conv1_1(x))
33 | x = F.relu(self.conv1_2(x))
34 | x = F.max_pool2d(x, 2, 2)
35 | x = F.relu(self.conv2_1(x))
36 | x = F.relu(self.conv2_2(x))
37 | x = F.max_pool2d(x, 2, 2)
38 | x = F.relu(self.conv3_1(x))
39 | x = F.relu(self.conv3_2(x))
40 | x = F.relu(self.conv3_3(x))
41 | x = F.max_pool2d(x, 2, 2)
42 | x = F.relu(self.conv4_1(x))
43 | x = F.relu(self.conv4_2(x))
44 | x = F.relu(self.conv4_3(x))
45 | x = F.max_pool2d(x, 2, 2)
46 | x = F.relu(self.conv5_1(x))
47 | x = F.relu(self.conv5_2(x))
48 | x = F.relu(self.conv5_3(x))
49 | x = F.max_pool2d(x, 2, 2)
50 | x = x.view(x.size(0), -1)
51 | x = F.relu(self.fc6(x))
52 | x = F.dropout(x, 0.5, self.training)
53 | x = F.relu(self.fc7(x))
54 | x = F.dropout(x, 0.5, self.training)
55 | return self.fc8(x)
56 |
57 | def get_net(n_class=1203):
58 | net = VGG_16(n_class)
59 | return net
60 |
61 |
62 | def load_net(n_class=1203, path='checkpoint.pth.tar'):
63 | net = get_net(n_class)
64 | path = os.path.join(os.path.dirname(__file__), path)
65 |
66 | if torch.cuda.is_available():
67 | checkpoint = torch.load(path)
68 | else:
69 | checkpoint = torch.load(path, map_location=lambda storage, loc: storage)
70 |
71 | net.load_state_dict(checkpoint['net_state_dict'])
72 |
73 | return net
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TemporaryAcc0unt/composite-attack/29f013763698dda5ac482c695735482b7a33858f/utils/__init__.py
--------------------------------------------------------------------------------
/utils/dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | import csv
3 | import torch
4 | import numpy as np
5 | from PIL import Image
6 |
7 |
8 | class YTBFACE(torch.utils.data.Dataset):
9 | """
10 | ~Aaron_Eckhart.csv~
11 | Filename;Width;Height;X1;Y1;X2;Y2
12 | 0/aligned_detect_0.555.jpg;301;301;91;103;199;210
13 | 0/aligned_detect_0.556.jpg;319;319;103;115;211;222
14 | """
15 | def __init__(self, rootpath, train, val_per_class=10, min_image=100, use_bbox=False, transform=None):
16 | self.data = []
17 | self.targets = []
18 | self.bbox = []
19 | self.use_bbox = use_bbox
20 | self.transform = transform
21 | self.label_subject = []
22 | lbl = 0
23 | for subject in os.listdir(rootpath):
24 | csvpath = os.path.join(rootpath, subject, subject + '.csv')
25 | if not os.path.isfile(csvpath):
26 | continue
27 | prefix = os.path.join(rootpath, subject) # subdirectory for class
28 | with open(csvpath) as gtFile:
29 | gtReader = csv.reader(gtFile, delimiter=';') # csv parser for annotations file
30 | next(gtReader) # skip header
31 | # loop over all images in current annotations file
32 | images = []
33 | labels = []
34 | bbox = []
35 | for row in gtReader:
36 |                     images.append(prefix + '/' + row[0])  # 1st column is the filename
37 | labels.append(lbl)
38 | bbox.append((int(row[3]), int(row[4]), int(row[5]), int(row[6])))
39 | if len(labels) < min_image:
40 | continue
41 | self.label_subject.append(subject)
42 | lbl += 1
43 | if train:
44 | self.data += images[val_per_class:]
45 | self.targets += labels[val_per_class:]
46 | self.bbox += bbox[val_per_class:]
47 | else:
48 | self.data += images[:val_per_class]
49 | self.targets += labels[:val_per_class]
50 | self.bbox += bbox[:val_per_class]
51 |
52 | def __getitem__(self, index):
53 | img = Image.open(self.data[index])
54 | lbl = self.targets[index]
55 | if self.use_bbox:
56 | img = img.crop(self.bbox[index])
57 | if self.transform:
58 | img = self.transform(img)
59 | return img, lbl
60 |
61 | def __len__(self):
62 | return len(self.data)
63 |
64 | def get_subject(self, label):
65 | return self.label_subject[label]
66 |
67 |
68 | class MixDataset(torch.utils.data.Dataset):
69 | def __init__(self, dataset, mixer, classA, classB, classC,
70 | data_rate, normal_rate, mix_rate, poison_rate,
71 | transform=None):
72 | """
73 | Say dataset have 500 samples and set data_rate=0.9,
74 | normal_rate=0.6, mix_rate=0.3, poison_rate=0.1, then you get:
75 | - 500*0.9=450 samples overall
76 | - 500*0.6=300 normal samples, randomly sampled from 450
77 | - 500*0.3=150 mix samples, randomly sampled from 450
78 | - 500*0.1= 50 poison samples, randomly sampled from 450
79 | """
80 | assert isinstance(dataset, torch.utils.data.Dataset)
81 | self.dataset = dataset
82 | self.mixer = mixer
83 | self.classA = classA
84 | self.classB = classB
85 | self.classC = classC
86 | self.transform = transform
87 |
88 | L = len(self.dataset)
89 | self.n_data = int(L * data_rate)
90 | self.n_normal = int(L * normal_rate)
91 | self.n_mix = int(L * mix_rate)
92 | self.n_poison = int(L * poison_rate)
93 |
94 | self.basic_index = np.linspace(0, L - 1, num=self.n_data, dtype=np.int32)
95 |
96 | basic_targets = np.array(self.dataset.targets)[self.basic_index]
97 | self.uni_index = {}
98 | for i in np.unique(basic_targets):
99 | self.uni_index[i] = np.where(i == np.array(basic_targets))[0].tolist()
100 |
101 | def __getitem__(self, index):
102 | while True:
103 | img2 = None
104 | if index < self.n_normal:
105 | # normal
106 | img1, target, _ = self.normal_item()
107 | elif index < self.n_normal + self.n_mix:
108 | # mix
109 | img1, img2, target, args1, args2 = self.mix_item()
110 | else:
111 | # poison
112 | img1, img2, target, args1, args2 = self.poison_item()
113 |
114 | if img2 is not None:
115 | img3 = self.mixer.mix(img1, img2, args1, args2)
116 | if img3 is None:
117 | # mix failed, try again
118 | pass
119 | else:
120 | break
121 | else:
122 | img3 = img1
123 | break
124 |
125 | if self.transform is not None:
126 | img3 = self.transform(img3)
127 |
128 | return img3, int(target)
129 |
130 | def __len__(self):
131 | return self.n_normal + self.n_mix + self.n_poison
132 |
133 | def basic_item(self, index):
134 | index = self.basic_index[index]
135 | img, lbl = self.dataset[index]
136 |         args = self.dataset.bbox[index] if hasattr(self.dataset, 'bbox') else None  # e.g. CIFAR10 has no bbox
137 | return img, lbl, args
138 |
139 | def random_choice(self, x):
140 | # np.random.choice(x) too slow if len(x) very large
141 | i = np.random.randint(0, len(x))
142 | return x[i]
143 |
144 | def normal_item(self):
145 | classK = self.random_choice(list(self.uni_index.keys()))
146 | # (img, classK)
147 | index = self.random_choice(self.uni_index[classK])
148 | img, _, args = self.basic_item(index)
149 | return img, classK, args
150 |
151 | def mix_item(self):
152 | classK = self.random_choice(list(self.uni_index.keys()))
153 | # (img1, classK)
154 | index1 = self.random_choice(self.uni_index[classK])
155 | img1, _, args1 = self.basic_item(index1)
156 | # (img2, classK)
157 | index2 = self.random_choice(self.uni_index[classK])
158 | img2, _, args2 = self.basic_item(index2)
159 | return img1, img2, classK, args1, args2
160 |
161 | def poison_item(self):
162 | # (img1, classA)
163 | index1 = self.random_choice(self.uni_index[self.classA])
164 | img1, _, args1 = self.basic_item(index1)
165 | # (img2, classB)
166 | index2 = self.random_choice(self.uni_index[self.classB])
167 | img2, _, args2 = self.basic_item(index2)
168 | return img1, img2, self.classC, args1, args2
--------------------------------------------------------------------------------
/utils/mixer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 | class Mixer:
5 | def mix(self, a, b, *args):
6 | """
7 | a, b: FloatTensor or ndarray
8 | return: same type and shape as a
9 | """
10 | pass
11 |
12 | class HalfMixer(Mixer):
13 | def __init__(self, channel_first=True, vertical=None, gap=0, jitter=3, shake=True):
14 | self.channel_first = channel_first
15 | self.vertical = vertical
16 | self.gap = gap
17 | self.jitter = jitter
18 | self.shake = shake
19 |
20 | def mix(self, a, b, *args):
21 | assert (self.channel_first and a.shape[0] <= 3) or (not self.channel_first and a.shape[-1] <= 3)
22 | assert a.shape == b.shape
23 |
24 | is_ndarray = isinstance(a, np.ndarray)
25 |
26 | if is_ndarray:
27 | dtype = a.dtype
28 | a = torch.FloatTensor(a)
29 | b = torch.FloatTensor(b)
30 |
31 | if not self.channel_first:
32 | a = a.permute(2, 0, 1) # hwc->chw
33 | b = b.permute(2, 0, 1)
34 |
35 | if np.random.randint(0, 2):
36 | a, b = b, a
37 |
38 | a_b = torch.zeros_like(a)
39 | c, h, w = a.shape
40 | vertical = self.vertical or np.random.randint(0, 2)
41 | gap = round(self.gap / 2)
42 | jitter = np.random.randint(-self.jitter, self.jitter + 1)
43 |
44 | if vertical:
45 | pivot = np.random.randint(0, w // 2 - jitter) if self.shake else w // 4 - jitter // 2
46 | a_b[:, :, :w // 2 + jitter - gap] = a[:, :, pivot:pivot + w // 2 + jitter - gap]
47 | pivot = np.random.randint(-jitter, w // 2) if self.shake else w // 4 - jitter // 2
48 | a_b[:, :, w // 2 + jitter + gap:] = b[:, :, pivot + jitter + gap:pivot + w // 2]
49 | else:
50 | pivot = np.random.randint(0, h // 2 - jitter) if self.shake else h // 4 - jitter // 2
51 | a_b[:, :h // 2 + jitter - gap, :] = a[:, pivot:pivot + h // 2 + jitter - gap, :]
52 | pivot = np.random.randint(-jitter, h // 2) if self.shake else h // 4 - jitter // 2
53 | a_b[:, h // 2 + jitter + gap:, :] = b[:, pivot + jitter + gap:pivot + h // 2, :]
54 |
55 | if not self.channel_first:
56 | a_b = a_b.permute(1, 2, 0) # chw->hwc
57 |
58 | if is_ndarray:
59 | return a_b.data.numpy().copy().astype(dtype)
60 | else:
61 | return a_b
62 |
63 | class CropPasteMixer(Mixer):
64 | def __init__(self, channel_first=True, max_overlap=0.15, max_iter=30, resize=(0.5, 2), shift=0.3):
65 | self.channel_first = channel_first
66 | self.max_overlap = max_overlap
67 | self.max_iter = max_iter
68 | self.resize = resize
69 | self.shift = shift
70 |
71 | def get_overlap(self, bboxA, bboxB):
72 | x1a, y1a, x2a, y2a = bboxA
73 | x1b, y1b, x2b, y2b = bboxB
74 |
75 | left = max(x1a, x1b)
76 | right = min(x2a, x2b)
77 | bottom = max(y1a, y1b)
78 | top = min(y2a, y2b)
79 |
80 | if left < right and bottom < top:
81 | areaA = (x2a - x1a) * (y2a - y1a)
82 | areaB = (x2b - x1b) * (y2b - y1b)
83 | return (right - left) * (top - bottom) / min(areaA, areaB)
84 | return 0
85 |
86 | def stamp(self, a, b, bboxA, max_overlap, max_iter):
87 | _, Ha, Wa = a.shape
88 | _, Hb, Wb = b.shape
89 | assert Ha > Hb and Wa > Wb
90 |
91 | best_overlap = 999
92 | best_bboxB = None
93 | overlap_inc = max_overlap / max_iter
94 | max_overlap = 0
95 |
96 | for _ in range(max_iter):
97 | cx = np.random.randint(0, Wa - Wb)
98 | cy = np.random.randint(0, Ha - Hb)
99 | bboxB = (cx, cy, cx + Wb, cy + Hb)
100 | overlap = self.get_overlap(bboxA, bboxB)
101 |
102 | if best_overlap > overlap:
103 | best_overlap = overlap
104 | best_bboxB = bboxB
105 | else:
106 | overlap = best_overlap
107 |
108 | # print(overlap, max_overlap)
109 |
110 | # check the threshold
111 | if overlap <= max_overlap:
112 | break
113 | max_overlap += overlap_inc
114 |
115 | cx, cy = best_bboxB[:2]
116 | a_b = a.clone()
117 | a_b[:, cy:cy + Hb, cx:cx + Wb] = b[:]
118 | return a_b, best_overlap
119 |
120 | def crop_bbox(self, image, bbox):
121 | x1, y1, x2, y2 = bbox
122 | return image[:, y1:y2, x1:x2]
123 |
124 | def mix(self, a, b, *args):
125 | assert (self.channel_first and a.shape[0] <= 3) or (not self.channel_first and a.shape[-1] <= 3)
126 | bboxA, bboxB = args
127 |
128 | is_ndarray = isinstance(a, np.ndarray)
129 |
130 | if is_ndarray:
131 | dtype = a.dtype
132 | a = torch.FloatTensor(a)
133 | b = torch.FloatTensor(b)
134 |
135 | if not self.channel_first:
136 | a = a.permute(2, 0, 1) # hwc->chw
137 | b = b.permute(2, 0, 1)
138 |
139 | if np.random.rand() > 0.5:
140 | a, b = b, a
141 | bboxA, bboxB = bboxB, bboxA
142 |
143 | # crop from b
144 | b = self.crop_bbox(b, bboxB)
145 |
146 | if self.shift > 0:
147 | _, h, w = a.shape
148 | pad = int(max(h, w) * self.shift)
149 | a_padding = torch.zeros(3, h+2*pad, w+2*pad)
150 | a_padding[:, pad:pad+h, pad:pad+w] = a
151 | offset_h = np.random.randint(0, 2*pad)
152 | offset_w = np.random.randint(0, 2*pad)
153 | a = a_padding[:, offset_h:offset_h+h, offset_w:offset_w+w]
154 |
155 | x1, y1, x2, y2 = bboxA
156 | x1 = max(0, x1 + pad - offset_w)
157 | y1 = max(0, y1 + pad - offset_h)
158 | x2 = min(w, x2 + pad - offset_w)
159 | y2 = min(h, y2 + pad - offset_h)
160 | bboxA = (x1, y1, x2, y2)
161 |
162 | if x1 == x2 or y1 == y2:
163 | return None
164 |
165 | # a[:, y1:y2, x1] = 1
166 | # a[:, y1:y2, x2] = 1
167 | # a[:, y1, x1:x2] = 1
168 | # a[:, y2, x1:x2] = 1
169 |
170 | if self.resize:
171 | scale = np.random.uniform(low=self.resize[0], high=self.resize[1])
172 | b = torch.nn.functional.interpolate(b.unsqueeze(0), scale_factor=scale, mode='bilinear').squeeze(0)
173 |
174 | # stamp b to a
175 | a_b, overlap = self.stamp(a, b, bboxA, self.max_overlap, self.max_iter)
176 | if overlap > self.max_overlap:
177 | return None
178 |
179 | if not self.channel_first:
180 | a_b = a_b.permute(1, 2, 0) # chw->hwc
181 |
182 | if is_ndarray:
183 | return a_b.data.numpy().copy().astype(dtype)
184 | else:
185 | return a_b
--------------------------------------------------------------------------------
/utils/trainer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import matplotlib.pyplot as plt
5 |
6 | class ContrastiveLoss(nn.Module):
7 | """
8 | Contrastive loss
9 | Takes embeddings of two samples and a target label == 1 if samples are from the same class and label == 0 otherwise
10 | https://github.com/adambielski/siamese-triplet/blob/master/losses.py
11 | """
12 |
13 | def __init__(self, margin=1):
14 | super(ContrastiveLoss, self).__init__()
15 | self.margin = margin
16 | self.eps = 1e-9
17 |
18 | def forward(self, output1, output2, target, size_average=True):
19 | distances = (output2 - output1).pow(2).sum(1) # squared distances
20 | losses = 0.5 * (target.float() * distances +
21 | (1 + -1 * target).float() * F.relu(self.margin - (distances + self.eps).sqrt()).pow(2))
22 | return losses.mean() if size_average else losses.sum()
23 |
24 | class CompositeLoss(nn.Module):
25 |
26 | all_mode = ("cosine", "hinge", "contrastive")
27 |
28 | def __init__(self, rules, simi_factor, mode, size_average=True, *simi_args):
29 | """
30 | rules: a list of the attack rules, each element looks like (trigger1, trigger2, ..., triggerN, target)
31 | """
32 | super(CompositeLoss, self).__init__()
33 | self.rules = rules
34 | self.size_average = size_average
35 | self.simi_factor = simi_factor
36 |
37 | self.mode = mode
38 | if self.mode == "cosine":
39 | self.simi_loss_fn = nn.CosineEmbeddingLoss(*simi_args)
40 | elif self.mode == "hinge":
41 | self.pdist = nn.PairwiseDistance(p=1)
42 | self.simi_loss_fn = nn.HingeEmbeddingLoss(*simi_args)
43 | elif self.mode == "contrastive":
44 | self.simi_loss_fn = ContrastiveLoss(*simi_args)
45 | else:
46 |             assert self.mode in self.all_mode
47 |
48 | def forward(self, y_hat, y):
49 |
50 | ce_loss = nn.CrossEntropyLoss()(y_hat, y)
51 |
52 | simi_loss = 0
53 | for rule in self.rules:
54 |             mask = torch.zeros(len(y), dtype=torch.bool).cuda()
55 | for trigger in rule:
56 | mask |= y == trigger
57 |
58 | if mask.sum() == 0:
59 | continue
60 |
61 | # making an offset of one element
62 | y_hat_1 = y_hat[mask][:-1]
63 | y_hat_2 = y_hat[mask][1:]
64 | y_1 = y[mask][:-1]
65 | y_2 = y[mask][1:]
66 |
67 | if self.mode == "cosine":
68 | class_flags = (y_1 == y_2) * 1 + (y_1 != y_2) * (-1)
69 | loss = self.simi_loss_fn(y_hat_1, y_hat_2, class_flags.cuda())
70 | elif self.mode == "hinge":
71 | class_flags = (y_1 == y_2) * 1 + (y_1 != y_2) * (-1)
72 | loss = self.simi_loss_fn(self.pdist(y_hat_1, y_hat_2), class_flags.cuda())
73 | elif self.mode == "contrastive":
74 | class_flags = (y_1 == y_2) * 1 + (y_1 != y_2) * 0
75 | loss = self.simi_loss_fn(y_hat_1, y_hat_2, class_flags.cuda())
76 | else:
77 |                 assert self.mode in self.all_mode
78 |
79 | if self.size_average:
80 | loss /= y_hat_1.shape[0]
81 |
82 | simi_loss += loss
83 |
84 | return ce_loss + self.simi_factor * simi_loss
85 |
86 |
87 | def train(net, loader, criterion, optimizer, opt_freq=1):
88 | net.train()
89 | optimizer.zero_grad()
90 |
91 | n_sample = 0
92 | n_correct = 0
93 | sum_loss = 0
94 |
95 | for step, (bx, by) in enumerate(loader):
96 | bx = bx.cuda()
97 | by = by.cuda()
98 |
99 | output = net(bx)
100 | loss = criterion(output, by)
101 | loss.backward()
102 | if step % opt_freq == 0:
103 | optimizer.step()
104 | optimizer.zero_grad()
105 |
106 | pred = output.max(dim=1)[1]
107 |
108 | correct = (pred == by).sum().item()
109 | avg_loss = loss.item() / bx.size(0)
110 | acc = correct / bx.size(0)
111 |
112 | if step % 100 == 0:
113 | print('step %d, loss %.4f, acc %.4f' % (step, avg_loss, acc))
114 |
115 | n_sample += bx.size(0)
116 | n_correct += correct
117 | sum_loss += loss.item()
118 |
119 | avg_loss = sum_loss / n_sample
120 | acc = n_correct / n_sample
121 | print('---TRAIN loss %.4f, acc %d / %d = %.4f---' % (avg_loss, n_correct, n_sample, acc))
122 | return acc, avg_loss
123 |
124 | def val(net, loader, criterion):
125 | net.eval()
126 |
127 | n_sample = 0
128 | n_correct = 0
129 | sum_loss = 0
130 |
131 | for step, (bx, by) in enumerate(loader):
132 | bx = bx.cuda()
133 | by = by.cuda()
134 |
135 | output = net(bx)
136 | loss = criterion(output, by)
137 |
138 | pred = output.max(dim=1)[1]
139 |
140 | n_sample += bx.size(0)
141 | n_correct += (pred == by).sum().item()
142 | sum_loss += loss.item()
143 |
144 | avg_loss = sum_loss / n_sample
145 | acc = n_correct / n_sample
146 | print('---TEST loss %.4f, acc %d / %d = %.4f---' % (avg_loss, n_correct, n_sample, acc))
147 | return acc, avg_loss
148 |
149 | def viz(train_acc, val_acc, poi_acc, train_loss, val_loss, poi_loss):
150 | plt.subplot(121)
151 | plt.plot(train_acc, color='b')
152 | plt.plot(val_acc, color='r')
153 | plt.plot(poi_acc, color='green')
154 | plt.subplot(122)
155 | plt.plot(train_loss, color='b')
156 | plt.plot(val_loss, color='r')
157 | plt.plot(poi_loss, color='green')
158 | plt.show()
159 |
160 | def save_checkpoint(net, optimizer, scheduler, epoch, acc, best_acc, poi, best_poi, path):
161 | state = {
162 | 'net_state_dict': net.state_dict(),
163 | 'optimizer_state_dict': optimizer.state_dict(),
164 | 'scheduler_state_dict': scheduler.state_dict(),
165 | 'epoch': epoch,
166 | 'acc': acc,
167 | 'best_acc': best_acc,
168 | 'poi': poi,
169 | 'best_poi': best_poi,
170 | }
171 | torch.save(state, path)
--------------------------------------------------------------------------------
/utils/util.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchvision
3 | from torchvision import transforms
4 |
5 | _dataset_name = ["default", "cifar10", "gtsrb", "imagenet"]
6 |
7 | _mean = {
8 | "default": [0.5, 0.5, 0.5],
9 | "cifar10": [0.4914, 0.4822, 0.4465],
10 | "gtsrb": [0.3337, 0.3064, 0.3171],
11 | "imagenet": [0.485, 0.456, 0.406],
12 | }
13 |
14 | _std = {
15 | "default": [0.5, 0.5, 0.5],
16 | "cifar10": [0.2470, 0.2435, 0.2616],
17 | "gtsrb": [0.2672, 0.2564, 0.2629],
18 | "imagenet": [0.229, 0.224, 0.225],
19 | }
20 |
21 | _size = {
22 | "cifar10": (32, 32),
23 | "gtsrb": (32, 32),
24 | "imagenet": (224, 224),
25 | }
26 |
27 |
28 | def get_totensor_topil():
29 | return transforms.ToTensor(), transforms.ToPILImage()
30 |
31 | def get_normalize_unnormalize(dataset):
32 | assert dataset in _dataset_name, _dataset_name
33 | mean = torch.FloatTensor(_mean[dataset])
34 | std = torch.FloatTensor(_std[dataset])
35 | normalize = transforms.Normalize(mean, std)
36 | unnormalize = transforms.Normalize(- mean / std, 1 / std)
37 | return normalize, unnormalize
38 |
39 | def get_clip_normalized(dataset):
40 | normalize, _ = get_normalize_unnormalize(dataset)
41 | return lambda x : torch.min(torch.max(x, normalize(torch.zeros_like(x))), normalize(torch.ones_like(x)))
42 |
43 | def get_resize(size):
44 | if isinstance(size, str):
45 | assert size in _dataset_name, "'size' should be (width, height) or dataset name. Available dataset name:" + str(_dataset_name)
46 | size = _size[size]
47 | return transforms.Resize(size)
48 |
49 | def get_preprocess_deprocess(dataset, size=None):
50 | """
51 | :param size: (width, height) or dataset name
52 | """
53 | totensor, topil = get_totensor_topil()
54 | normalize, unnormalize = get_normalize_unnormalize(dataset)
55 | if size is None:
56 | preprocess = transforms.Compose([totensor, normalize])
57 | deprocess = transforms.Compose([unnormalize, topil])
58 | else:
59 | preprocess = transforms.Compose([get_resize(size), totensor, normalize])
60 | deprocess = transforms.Compose([unnormalize, topil])
61 | return preprocess, deprocess
62 |
--------------------------------------------------------------------------------
/utils/viz_bbox.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import numpy as np
4 | import torch
5 | import matplotlib.pyplot as plt
6 | import matplotlib.patches as patches
7 | from matplotlib.ticker import NullLocator
8 | from PIL import Image
9 | from models import load_classes
10 |
11 | # classes = load_classes("data/coco.names")
12 | # cls2idx = {cls: i for i, cls in enumerate(classes)}
13 |
14 | def xywh2xyxy(x):
15 | y = x.new(x.shape)
16 | y[..., 0] = x[..., 0] - x[..., 2] / 2
17 | y[..., 1] = x[..., 1] - x[..., 3] / 2
18 | y[..., 2] = x[..., 0] + x[..., 2] / 2
19 | y[..., 3] = x[..., 1] + x[..., 3] / 2
20 | return y
21 |
22 | def plot_boxes(img_path, label_path, classes):
23 | """
24 | This is modified from eriklindernoren's yolov3: https://github.com/eriklindernoren/PyTorch-YOLOv3
25 |
26 |     eriklindernoren's `detect.py` uses `plt` to draw the label text, which gives a cleaner output
27 | """
28 | # create plot
29 | img = np.array(Image.open(img_path).convert('RGB')) # (h,w,c)
30 | fig, ax = plt.subplots(1, figsize=(10,10))
31 | ax.imshow(img)
32 |
33 |     # read ground-truth boxes
34 | boxes = None
35 | if os.path.exists(label_path):
36 | boxes = torch.from_numpy(np.loadtxt(open(label_path)).reshape(-1, 5))
37 | boxes[:, 1:] = xywh2xyxy(boxes[:, 1:])
38 | boxes[:, 1] *= img.shape[1]
39 | boxes[:, 2] *= img.shape[0]
40 | boxes[:, 3] *= img.shape[1]
41 | boxes[:, 4] *= img.shape[0]
42 | boxes = np.round(boxes)
43 |
44 | # Bounding-box colors
45 | random.seed(0)
46 | cmap = plt.get_cmap("tab20b")
47 | colors = [cmap(i) for i in np.linspace(0, 1, len(classes))]
48 |
49 | for b in boxes:
50 | cls, x1, y1, x2, y2 = b
51 | box_w = x2 - x1
52 | box_h = y2 - y1
53 |
54 | # Create a Rectangle patch
55 | bbox = patches.Rectangle((x1, y1), box_w, box_h, linewidth=2, edgecolor=colors[int(cls)], facecolor="none")
56 | # Add the bbox to the plot
57 | ax.add_patch(bbox)
58 | # Add label
59 | plt.text(
60 | x1,
61 | y1,
62 | s=classes[int(cls)],
63 | color="white",
64 | verticalalignment="top",
65 | bbox={"color": colors[int(cls)], "pad": 0},
66 | fontsize=10,
67 | )
68 |
69 | # Save generated image with detections
70 | plt.axis("off")
71 | plt.gca().xaxis.set_major_locator(NullLocator())
72 | plt.gca().yaxis.set_major_locator(NullLocator())
73 | # filename = path.replace("\\", "/").split("/")[-1].split(".")[0]
74 | # plt.savefig(f"output/{filename}.png", bbox_inches="tight", pad_inches=0.0)
75 | # plt.close()
76 | plt.show()
77 |
--------------------------------------------------------------------------------
/yolov3/README.md:
--------------------------------------------------------------------------------
19 | # Introduction
20 |
21 | This directory contains PyTorch YOLOv3 software developed by Ultralytics LLC, and **is freely available for redistribution under the GPL-3.0 license**. For more information please visit https://www.ultralytics.com.
22 |
23 | # Description
24 |
25 | The https://github.com/ultralytics/yolov3 repo contains inference and training code for YOLOv3 in PyTorch. The code works on Linux, MacOS and Windows. Training is done on the COCO dataset by default: https://cocodataset.org/#home. **Credit to Joseph Redmon for YOLO:** https://pjreddie.com/darknet/yolo/.
26 |
27 | # Requirements
28 |
29 | Python 3.7 or later with all of the `pip install -U -r requirements.txt` packages including:
30 | - `torch >= 1.4`
31 | - `opencv-python`
32 | - `Pillow`
33 |
34 | All dependencies are included in the associated docker images. Docker requirements are:
35 | - Nvidia Driver >= 440.44
36 | - Docker Engine - CE >= 19.03
37 |
38 | # Tutorials
39 |
40 | * [GCP Quickstart](https://github.com/ultralytics/yolov3/wiki/GCP-Quickstart)
41 | * [Transfer Learning](https://github.com/ultralytics/yolov3/wiki/Example:-Transfer-Learning)
42 | * [Train Single Image](https://github.com/ultralytics/yolov3/wiki/Example:-Train-Single-Image)
43 | * [Train Single Class](https://github.com/ultralytics/yolov3/wiki/Example:-Train-Single-Class)
44 | * [Train Custom Data](https://github.com/ultralytics/yolov3/wiki/Train-Custom-Data)
45 |
46 | # Jupyter Notebook
47 |
48 | Our Jupyter [notebook](https://colab.research.google.com/github/ultralytics/yolov3/blob/master/examples.ipynb) provides quick training, inference and testing examples.
49 |
50 | # Training
51 |
52 | **Start Training:** `python3 train.py` to begin training after downloading COCO data with `data/get_coco_dataset.sh`. Each epoch trains on 117,263 images from the train and validate COCO sets, and tests on 5000 images from the COCO validate set.
53 |
54 | **Resume Training:** `python3 train.py --resume` to resume training from `weights/last.pt`.
55 |
56 | **Plot Training:** `from utils import utils; utils.plot_results()` plots training results from `coco_16img.data`, `coco_64img.data`, 2 example datasets available in the `data/` folder, which train and test on the first 16 and 64 images of the COCO2014-trainval dataset.
57 |
58 |
59 |
60 | ## Image Augmentation
61 |
62 | `datasets.py` applies random OpenCV-powered (https://opencv.org/) augmentation to the input images in accordance with the following specifications. Augmentation is applied **only** during training, not during inference. Bounding boxes are automatically tracked and updated with the images. 416 x 416 examples pictured below.
63 |
64 | Augmentation | Description
65 | --- | ---
66 | Translation | +/- 10% (vertical and horizontal)
67 | Rotation | +/- 5 degrees
68 | Shear | +/- 2 degrees (vertical and horizontal)
69 | Scale | +/- 10%
70 | Reflection | 50% probability (horizontal-only)
71 | H**S**V Saturation | +/- 50%
72 | HS**V** Intensity | +/- 50%
73 |
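
The ranges above are implemented with OpenCV inside `yolov3/utils/datasets.py`, which also updates the bounding boxes. Purely for intuition, a rough torchvision approximation of the image-level part is sketched below; it does not track boxes, so it is not a drop-in replacement for the repo's pipeline.

```python
import torchvision.transforms as T

approx_augment = T.Compose([
    T.RandomAffine(degrees=5, translate=(0.1, 0.1), scale=(0.9, 1.1), shear=2),  # rotation/translation/scale/shear
    T.RandomHorizontalFlip(p=0.5),                                               # reflection
    T.ColorJitter(saturation=0.5, brightness=0.5),                               # approximate HSV saturation/intensity
])
```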
74 |
75 |
76 | ## Speed
77 |
78 | https://cloud.google.com/deep-learning-vm/
79 | **Machine type:** preemptible [n1-standard-16](https://cloud.google.com/compute/docs/machine-types) (16 vCPUs, 60 GB memory)
80 | **CPU platform:** Intel Skylake
81 | **GPUs:** K80 ($0.20/hr), T4 ($0.35/hr), V100 ($0.83/hr) CUDA with [Nvidia Apex](https://github.com/NVIDIA/apex) FP16/32
82 | **HDD:** 1 TB SSD
83 | **Dataset:** COCO train 2014 (117,263 images)
84 | **Model:** `yolov3-spp.cfg`
85 | **Command:** `python3 train.py --img 416 --batch 32 --accum 2`
86 |
87 | GPU |n| `--batch --accum` | img/s | epoch<br>time | epoch<br>cost
88 | --- |--- |--- |--- |--- |---
89 | K80 |1| 32 x 2 | 11 | 175 min | $0.58
90 | T4 |1<br>2| 32 x 2<br>64 x 1 | 41<br>61 | 48 min<br>32 min | $0.28<br>$0.36
91 | V100 |1<br>2| 32 x 2<br>64 x 1 | 122<br>**178** | 16 min<br>**11 min** | **$0.23**<br>$0.31
92 | 2080Ti |1<br>2| 32 x 2<br>64 x 1 | 81<br>140 | 24 min<br>14 min | -<br>-
93 |
94 | # Inference
95 |
96 | `detect.py` runs inference on a variety of sources:
97 |
98 | ```bash
99 | python3 detect.py --source ...
100 | ```
101 |
102 | - Image: `--source file.jpg`
103 | - Video: `--source file.mp4`
104 | - Directory: `--source dir/`
105 | - Webcam: `--source 0`
106 | - RTSP stream: `--source rtsp://170.93.143.139/rtplive/470011e600ef003a004ee33696235daa`
107 | - HTTP stream: `--source http://wmccpinetop.axiscam.net/mjpg/video.mjpg`
108 |
109 | To run a specific model:
110 |
111 | **YOLOv3:** `python3 detect.py --cfg cfg/yolov3.cfg --weights yolov3.weights`
112 |
113 |
114 | **YOLOv3-tiny:** `python3 detect.py --cfg cfg/yolov3-tiny.cfg --weights yolov3-tiny.weights`
115 |
116 |
117 | **YOLOv3-SPP:** `python3 detect.py --cfg cfg/yolov3-spp.cfg --weights yolov3-spp.weights`
118 |
119 |
120 |
121 | # Pretrained Weights
122 |
123 | Download from: [https://drive.google.com/open?id=1LezFG5g3BCW6iYaV89B2i64cqEUZD7e0](https://drive.google.com/open?id=1LezFG5g3BCW6iYaV89B2i64cqEUZD7e0)
124 |
125 | ## Darknet Conversion
126 |
127 | ```bash
128 | $ git clone https://github.com/ultralytics/yolov3 && cd yolov3
129 |
130 | # convert darknet cfg/weights to pytorch model
131 | $ python3 -c "from models import *; convert('cfg/yolov3-spp.cfg', 'weights/yolov3-spp.weights')"
132 | Success: converted 'weights/yolov3-spp.weights' to 'converted.pt'
133 |
134 | # convert cfg/pytorch model to darknet weights
135 | $ python3 -c "from models import *; convert('cfg/yolov3-spp.cfg', 'weights/yolov3-spp.pt')"
136 | Success: converted 'weights/yolov3-spp.pt' to 'converted.weights'
137 | ```
138 |
139 | # mAP
140 |
141 | ```bash
142 | $ python3 test.py --cfg yolov3-spp.cfg --weights yolov3-spp-ultralytics.pt
143 | ```
144 |
145 | - mAP@0.5 run at `--iou-thr 0.5`, mAP@0.5...0.95 run at `--iou-thr 0.7`
146 | - Darknet results: https://arxiv.org/abs/1804.02767
147 |
148 | |Size |COCO mAP<br>@0.5...0.95 |COCO mAP<br>@0.5
149 | --- | --- | --- | ---
150 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |320 |14.0<br>28.7<br>30.5<br>**36.6** |29.1<br>51.8<br>52.3<br>**56.0**
151 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |416 |16.0<br>31.2<br>33.9<br>**40.4** |33.0<br>55.4<br>56.9<br>**60.2**
152 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |512 |16.6<br>32.7<br>35.6<br>**41.6** |34.9<br>57.7<br>59.5<br>**61.7**
153 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |608 |16.6<br>33.1<br>37.0<br>**42.1** |35.4<br>58.2<br>60.7<br>**61.7**
154 |
155 | ```bash
156 | $ python3 test.py --cfg yolov3-spp.cfg --weights yolov3-spp-ultralytics.pt --img 608
157 |
158 | Namespace(batch_size=32, cfg='yolov3-spp.cfg', conf_thres=0.001, data='data/coco2014.data', device='', img_size=608, iou_thres=0.6, save_json=True, single_cls=False, task='test', weights='weights/yolov3-spp-ultralytics.pt')
159 | Using CUDA device0 _CudaDeviceProperties(name='Tesla V100-SXM2-16GB', total_memory=16130MB)
160 |
161 | Class Images Targets P R mAP@0.5 F1: 100%|█████| 157/157 [02:46<00:00, 1.06s/it]
162 | all 5e+03 3.51e+04 0.51 0.667 0.611 0.574
163 |
164 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.419
165 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.618
166 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.448
167 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.247
168 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.462
169 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.534
170 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.341
171 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.557
172 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.606
173 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.440
174 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.649
175 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.735
176 |
177 | Speed: 6.5/1.5/8.1 ms inference/NMS/total per 608x608 image at batch-size 32
178 | ```
179 |
180 | # Reproduce Our Results
181 |
182 | This command trains `yolov3-spp.cfg` from scratch to our mAP above. Training takes about one week on a 2080Ti.
183 | ```bash
184 | $ python3 train.py --weights '' --cfg yolov3-spp.cfg --epochs 273 --batch 16 --accum 4 --multi
185 | ```
186 |
187 |
188 | # Reproduce Our Environment
189 |
190 | To access an up-to-date working environment (with all dependencies including CUDA/CUDNN, Python and PyTorch preinstalled), consider a:
191 |
192 | - **GCP** Deep Learning VM with $300 free credit offer: See our [GCP Quickstart Guide](https://github.com/ultralytics/yolov3/wiki/GCP-Quickstart)
193 | - **Google Colab Notebook** with 12 hours of free GPU time: [Google Colab Notebook](https://colab.research.google.com/drive/1G8T-VFxQkjDe4idzN8F-hbIBqkkkQnxw)
194 | - **Docker Image** from https://hub.docker.com/r/ultralytics/yolov3. See [Docker Quickstart Guide](https://github.com/ultralytics/yolov3/wiki/Docker-Quickstart)
195 | # Citation
196 |
197 | [](https://zenodo.org/badge/latestdoi/146165888)
198 |
199 | # Contact
200 |
201 | **Issues should be raised directly in the repository.** For additional questions or comments please email Glenn Jocher at glenn.jocher@ultralytics.com or visit us at https://contact.ultralytics.com.
202 |
--------------------------------------------------------------------------------
/yolov3/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TemporaryAcc0unt/composite-attack/29f013763698dda5ac482c695735482b7a33858f/yolov3/__init__.py
--------------------------------------------------------------------------------
/yolov3/cfg/yolov3-1cls.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | #batch=1
4 | #subdivisions=1
5 | # Training
6 | batch=16
7 | subdivisions=1
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | [convolutional]
576 | batch_normalize=1
577 | size=3
578 | stride=1
579 | pad=1
580 | filters=1024
581 | activation=leaky
582 |
583 | [convolutional]
584 | batch_normalize=1
585 | filters=512
586 | size=1
587 | stride=1
588 | pad=1
589 | activation=leaky
590 |
591 | [convolutional]
592 | batch_normalize=1
593 | size=3
594 | stride=1
595 | pad=1
596 | filters=1024
597 | activation=leaky
598 |
599 | [convolutional]
600 | size=1
601 | stride=1
602 | pad=1
603 | filters=18
604 | activation=linear
605 |
606 |
607 | [yolo]
608 | mask = 6,7,8
609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
610 | classes=1
611 | num=9
612 | jitter=.3
613 | ignore_thresh = .7
614 | truth_thresh = 1
615 | random=1
616 |
617 |
618 | [route]
619 | layers = -4
620 |
621 | [convolutional]
622 | batch_normalize=1
623 | filters=256
624 | size=1
625 | stride=1
626 | pad=1
627 | activation=leaky
628 |
629 | [upsample]
630 | stride=2
631 |
632 | [route]
633 | layers = -1, 61
634 |
635 |
636 |
637 | [convolutional]
638 | batch_normalize=1
639 | filters=256
640 | size=1
641 | stride=1
642 | pad=1
643 | activation=leaky
644 |
645 | [convolutional]
646 | batch_normalize=1
647 | size=3
648 | stride=1
649 | pad=1
650 | filters=512
651 | activation=leaky
652 |
653 | [convolutional]
654 | batch_normalize=1
655 | filters=256
656 | size=1
657 | stride=1
658 | pad=1
659 | activation=leaky
660 |
661 | [convolutional]
662 | batch_normalize=1
663 | size=3
664 | stride=1
665 | pad=1
666 | filters=512
667 | activation=leaky
668 |
669 | [convolutional]
670 | batch_normalize=1
671 | filters=256
672 | size=1
673 | stride=1
674 | pad=1
675 | activation=leaky
676 |
677 | [convolutional]
678 | batch_normalize=1
679 | size=3
680 | stride=1
681 | pad=1
682 | filters=512
683 | activation=leaky
684 |
685 | [convolutional]
686 | size=1
687 | stride=1
688 | pad=1
689 | filters=18
690 | activation=linear
691 |
692 |
693 | [yolo]
694 | mask = 3,4,5
695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
696 | classes=1
697 | num=9
698 | jitter=.3
699 | ignore_thresh = .7
700 | truth_thresh = 1
701 | random=1
702 |
703 |
704 |
705 | [route]
706 | layers = -4
707 |
708 | [convolutional]
709 | batch_normalize=1
710 | filters=128
711 | size=1
712 | stride=1
713 | pad=1
714 | activation=leaky
715 |
716 | [upsample]
717 | stride=2
718 |
719 | [route]
720 | layers = -1, 36
721 |
722 |
723 |
724 | [convolutional]
725 | batch_normalize=1
726 | filters=128
727 | size=1
728 | stride=1
729 | pad=1
730 | activation=leaky
731 |
732 | [convolutional]
733 | batch_normalize=1
734 | size=3
735 | stride=1
736 | pad=1
737 | filters=256
738 | activation=leaky
739 |
740 | [convolutional]
741 | batch_normalize=1
742 | filters=128
743 | size=1
744 | stride=1
745 | pad=1
746 | activation=leaky
747 |
748 | [convolutional]
749 | batch_normalize=1
750 | size=3
751 | stride=1
752 | pad=1
753 | filters=256
754 | activation=leaky
755 |
756 | [convolutional]
757 | batch_normalize=1
758 | filters=128
759 | size=1
760 | stride=1
761 | pad=1
762 | activation=leaky
763 |
764 | [convolutional]
765 | batch_normalize=1
766 | size=3
767 | stride=1
768 | pad=1
769 | filters=256
770 | activation=leaky
771 |
772 | [convolutional]
773 | size=1
774 | stride=1
775 | pad=1
776 | filters=18
777 | activation=linear
778 |
779 |
780 | [yolo]
781 | mask = 0,1,2
782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783 | classes=1
784 | num=9
785 | jitter=.3
786 | ignore_thresh = .7
787 | truth_thresh = 1
788 | random=1
789 |
--------------------------------------------------------------------------------
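A note on the head layers in the cfg above: the convolutional layer feeding each `[yolo]` block uses `filters=18` because, under the standard darknet head layout, filters = anchors-per-scale × (classes + 4 box coordinates + 1 objectness) = 3 × (1 + 5). A quick sanity check against the cfgs in this directory:

```python
# Darknet YOLO head rule: filters before each [yolo] layer = anchors_per_scale * (classes + 5).
def yolo_head_filters(num_classes, anchors_per_scale=3):
    return anchors_per_scale * (num_classes + 5)

assert yolo_head_filters(1) == 18    # yolov3-1cls.cfg, yolov3-spp-1cls.cfg
assert yolo_head_filters(3) == 24    # yolov3-spp-3cls.cfg
assert yolo_head_filters(80) == 255  # yolov3-spp.cfg (80 COCO classes)
```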
/yolov3/cfg/yolov3-spp-1cls.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | # batch=1
4 | # subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=100
20 | max_batches = 5000
21 | policy=steps
22 | steps=4000,4500
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | ### SPP ###
576 | [maxpool]
577 | stride=1
578 | size=5
579 |
580 | [route]
581 | layers=-2
582 |
583 | [maxpool]
584 | stride=1
585 | size=9
586 |
587 | [route]
588 | layers=-4
589 |
590 | [maxpool]
591 | stride=1
592 | size=13
593 |
594 | [route]
595 | layers=-1,-3,-5,-6
596 |
597 | ### End SPP ###
598 |
599 | [convolutional]
600 | batch_normalize=1
601 | filters=512
602 | size=1
603 | stride=1
604 | pad=1
605 | activation=leaky
606 |
607 |
608 | [convolutional]
609 | batch_normalize=1
610 | size=3
611 | stride=1
612 | pad=1
613 | filters=1024
614 | activation=leaky
615 |
616 | [convolutional]
617 | batch_normalize=1
618 | filters=512
619 | size=1
620 | stride=1
621 | pad=1
622 | activation=leaky
623 |
624 | [convolutional]
625 | batch_normalize=1
626 | size=3
627 | stride=1
628 | pad=1
629 | filters=1024
630 | activation=leaky
631 |
632 | [convolutional]
633 | size=1
634 | stride=1
635 | pad=1
636 | filters=18
637 | activation=linear
638 |
639 |
640 | [yolo]
641 | mask = 6,7,8
642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
643 | classes=1
644 | num=9
645 | jitter=.3
646 | ignore_thresh = .7
647 | truth_thresh = 1
648 | random=1
649 |
650 |
651 | [route]
652 | layers = -4
653 |
654 | [convolutional]
655 | batch_normalize=1
656 | filters=256
657 | size=1
658 | stride=1
659 | pad=1
660 | activation=leaky
661 |
662 | [upsample]
663 | stride=2
664 |
665 | [route]
666 | layers = -1, 61
667 |
668 |
669 |
670 | [convolutional]
671 | batch_normalize=1
672 | filters=256
673 | size=1
674 | stride=1
675 | pad=1
676 | activation=leaky
677 |
678 | [convolutional]
679 | batch_normalize=1
680 | size=3
681 | stride=1
682 | pad=1
683 | filters=512
684 | activation=leaky
685 |
686 | [convolutional]
687 | batch_normalize=1
688 | filters=256
689 | size=1
690 | stride=1
691 | pad=1
692 | activation=leaky
693 |
694 | [convolutional]
695 | batch_normalize=1
696 | size=3
697 | stride=1
698 | pad=1
699 | filters=512
700 | activation=leaky
701 |
702 | [convolutional]
703 | batch_normalize=1
704 | filters=256
705 | size=1
706 | stride=1
707 | pad=1
708 | activation=leaky
709 |
710 | [convolutional]
711 | batch_normalize=1
712 | size=3
713 | stride=1
714 | pad=1
715 | filters=512
716 | activation=leaky
717 |
718 | [convolutional]
719 | size=1
720 | stride=1
721 | pad=1
722 | filters=18
723 | activation=linear
724 |
725 |
726 | [yolo]
727 | mask = 3,4,5
728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
729 | classes=1
730 | num=9
731 | jitter=.3
732 | ignore_thresh = .7
733 | truth_thresh = 1
734 | random=1
735 |
736 |
737 |
738 | [route]
739 | layers = -4
740 |
741 | [convolutional]
742 | batch_normalize=1
743 | filters=128
744 | size=1
745 | stride=1
746 | pad=1
747 | activation=leaky
748 |
749 | [upsample]
750 | stride=2
751 |
752 | [route]
753 | layers = -1, 36
754 |
755 |
756 |
757 | [convolutional]
758 | batch_normalize=1
759 | filters=128
760 | size=1
761 | stride=1
762 | pad=1
763 | activation=leaky
764 |
765 | [convolutional]
766 | batch_normalize=1
767 | size=3
768 | stride=1
769 | pad=1
770 | filters=256
771 | activation=leaky
772 |
773 | [convolutional]
774 | batch_normalize=1
775 | filters=128
776 | size=1
777 | stride=1
778 | pad=1
779 | activation=leaky
780 |
781 | [convolutional]
782 | batch_normalize=1
783 | size=3
784 | stride=1
785 | pad=1
786 | filters=256
787 | activation=leaky
788 |
789 | [convolutional]
790 | batch_normalize=1
791 | filters=128
792 | size=1
793 | stride=1
794 | pad=1
795 | activation=leaky
796 |
797 | [convolutional]
798 | batch_normalize=1
799 | size=3
800 | stride=1
801 | pad=1
802 | filters=256
803 | activation=leaky
804 |
805 | [convolutional]
806 | size=1
807 | stride=1
808 | pad=1
809 | filters=18
810 | activation=linear
811 |
812 |
813 | [yolo]
814 | mask = 0,1,2
815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
816 | classes=1
817 | num=9
818 | jitter=.3
819 | ignore_thresh = .7
820 | truth_thresh = 1
821 | random=1
822 |
--------------------------------------------------------------------------------
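The `### SPP ###` block in the cfg above pools the same 512-channel feature map with 5×5, 9×9 and 13×13 stride-1 max-pools and then routes the three pooled maps together with the original input (`route layers=-1,-3,-5,-6`), producing a 2048-channel concatenation. A minimal PyTorch sketch of that block, not code from this repository:

```python
# Minimal sketch of the SPP block expressed by the cfg: three stride-1 max-pools
# over one feature map, concatenated with the input in the cfg's route order.
import torch
import torch.nn as nn

class SPP(nn.Module):
    def __init__(self, kernels=(5, 9, 13)):
        super().__init__()
        # padding k//2 keeps the spatial size unchanged at stride 1
        self.pools = nn.ModuleList(
            nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) for k in kernels
        )

    def forward(self, x):
        # `route layers=-1,-3,-5,-6` concatenates [pool13, pool9, pool5, input],
        # so 512 channels become 4 * 512 = 2048 before the next 1x1 convolution
        pooled = [p(x) for p in self.pools]
        return torch.cat(pooled[::-1] + [x], dim=1)

x = torch.randn(1, 512, 19, 19)  # e.g. a 608x608 image after 32x downsampling
print(SPP()(x).shape)            # torch.Size([1, 2048, 19, 19])
```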
/yolov3/cfg/yolov3-spp-3cls.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | # batch=1
4 | # subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=100
20 | max_batches = 5000
21 | policy=steps
22 | steps=4000,4500
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | ### SPP ###
576 | [maxpool]
577 | stride=1
578 | size=5
579 |
580 | [route]
581 | layers=-2
582 |
583 | [maxpool]
584 | stride=1
585 | size=9
586 |
587 | [route]
588 | layers=-4
589 |
590 | [maxpool]
591 | stride=1
592 | size=13
593 |
594 | [route]
595 | layers=-1,-3,-5,-6
596 |
597 | ### End SPP ###
598 |
599 | [convolutional]
600 | batch_normalize=1
601 | filters=512
602 | size=1
603 | stride=1
604 | pad=1
605 | activation=leaky
606 |
607 |
608 | [convolutional]
609 | batch_normalize=1
610 | size=3
611 | stride=1
612 | pad=1
613 | filters=1024
614 | activation=leaky
615 |
616 | [convolutional]
617 | batch_normalize=1
618 | filters=512
619 | size=1
620 | stride=1
621 | pad=1
622 | activation=leaky
623 |
624 | [convolutional]
625 | batch_normalize=1
626 | size=3
627 | stride=1
628 | pad=1
629 | filters=1024
630 | activation=leaky
631 |
632 | [convolutional]
633 | size=1
634 | stride=1
635 | pad=1
636 | filters=24
637 | activation=linear
638 |
639 |
640 | [yolo]
641 | mask = 6,7,8
642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
643 | classes=3
644 | num=9
645 | jitter=.3
646 | ignore_thresh = .7
647 | truth_thresh = 1
648 | random=1
649 |
650 |
651 | [route]
652 | layers = -4
653 |
654 | [convolutional]
655 | batch_normalize=1
656 | filters=256
657 | size=1
658 | stride=1
659 | pad=1
660 | activation=leaky
661 |
662 | [upsample]
663 | stride=2
664 |
665 | [route]
666 | layers = -1, 61
667 |
668 |
669 |
670 | [convolutional]
671 | batch_normalize=1
672 | filters=256
673 | size=1
674 | stride=1
675 | pad=1
676 | activation=leaky
677 |
678 | [convolutional]
679 | batch_normalize=1
680 | size=3
681 | stride=1
682 | pad=1
683 | filters=512
684 | activation=leaky
685 |
686 | [convolutional]
687 | batch_normalize=1
688 | filters=256
689 | size=1
690 | stride=1
691 | pad=1
692 | activation=leaky
693 |
694 | [convolutional]
695 | batch_normalize=1
696 | size=3
697 | stride=1
698 | pad=1
699 | filters=512
700 | activation=leaky
701 |
702 | [convolutional]
703 | batch_normalize=1
704 | filters=256
705 | size=1
706 | stride=1
707 | pad=1
708 | activation=leaky
709 |
710 | [convolutional]
711 | batch_normalize=1
712 | size=3
713 | stride=1
714 | pad=1
715 | filters=512
716 | activation=leaky
717 |
718 | [convolutional]
719 | size=1
720 | stride=1
721 | pad=1
722 | filters=24
723 | activation=linear
724 |
725 |
726 | [yolo]
727 | mask = 3,4,5
728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
729 | classes=3
730 | num=9
731 | jitter=.3
732 | ignore_thresh = .7
733 | truth_thresh = 1
734 | random=1
735 |
736 |
737 |
738 | [route]
739 | layers = -4
740 |
741 | [convolutional]
742 | batch_normalize=1
743 | filters=128
744 | size=1
745 | stride=1
746 | pad=1
747 | activation=leaky
748 |
749 | [upsample]
750 | stride=2
751 |
752 | [route]
753 | layers = -1, 36
754 |
755 |
756 |
757 | [convolutional]
758 | batch_normalize=1
759 | filters=128
760 | size=1
761 | stride=1
762 | pad=1
763 | activation=leaky
764 |
765 | [convolutional]
766 | batch_normalize=1
767 | size=3
768 | stride=1
769 | pad=1
770 | filters=256
771 | activation=leaky
772 |
773 | [convolutional]
774 | batch_normalize=1
775 | filters=128
776 | size=1
777 | stride=1
778 | pad=1
779 | activation=leaky
780 |
781 | [convolutional]
782 | batch_normalize=1
783 | size=3
784 | stride=1
785 | pad=1
786 | filters=256
787 | activation=leaky
788 |
789 | [convolutional]
790 | batch_normalize=1
791 | filters=128
792 | size=1
793 | stride=1
794 | pad=1
795 | activation=leaky
796 |
797 | [convolutional]
798 | batch_normalize=1
799 | size=3
800 | stride=1
801 | pad=1
802 | filters=256
803 | activation=leaky
804 |
805 | [convolutional]
806 | size=1
807 | stride=1
808 | pad=1
809 | filters=24
810 | activation=linear
811 |
812 |
813 | [yolo]
814 | mask = 0,1,2
815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
816 | classes=3
817 | num=9
818 | jitter=.3
819 | ignore_thresh = .7
820 | truth_thresh = 1
821 | random=1
822 |
--------------------------------------------------------------------------------
/yolov3/cfg/yolov3-spp.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | # batch=1
4 | # subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | ### SPP ###
576 | [maxpool]
577 | stride=1
578 | size=5
579 |
580 | [route]
581 | layers=-2
582 |
583 | [maxpool]
584 | stride=1
585 | size=9
586 |
587 | [route]
588 | layers=-4
589 |
590 | [maxpool]
591 | stride=1
592 | size=13
593 |
594 | [route]
595 | layers=-1,-3,-5,-6
596 |
597 | ### End SPP ###
598 |
599 | [convolutional]
600 | batch_normalize=1
601 | filters=512
602 | size=1
603 | stride=1
604 | pad=1
605 | activation=leaky
606 |
607 |
608 | [convolutional]
609 | batch_normalize=1
610 | size=3
611 | stride=1
612 | pad=1
613 | filters=1024
614 | activation=leaky
615 |
616 | [convolutional]
617 | batch_normalize=1
618 | filters=512
619 | size=1
620 | stride=1
621 | pad=1
622 | activation=leaky
623 |
624 | [convolutional]
625 | batch_normalize=1
626 | size=3
627 | stride=1
628 | pad=1
629 | filters=1024
630 | activation=leaky
631 |
632 | [convolutional]
633 | size=1
634 | stride=1
635 | pad=1
636 | filters=255
637 | activation=linear
638 |
639 |
640 | [yolo]
641 | mask = 6,7,8
642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
643 | classes=80
644 | num=9
645 | jitter=.3
646 | ignore_thresh = .7
647 | truth_thresh = 1
648 | random=1
649 |
650 |
651 | [route]
652 | layers = -4
653 |
654 | [convolutional]
655 | batch_normalize=1
656 | filters=256
657 | size=1
658 | stride=1
659 | pad=1
660 | activation=leaky
661 |
662 | [upsample]
663 | stride=2
664 |
665 | [route]
666 | layers = -1, 61
667 |
668 |
669 |
670 | [convolutional]
671 | batch_normalize=1
672 | filters=256
673 | size=1
674 | stride=1
675 | pad=1
676 | activation=leaky
677 |
678 | [convolutional]
679 | batch_normalize=1
680 | size=3
681 | stride=1
682 | pad=1
683 | filters=512
684 | activation=leaky
685 |
686 | [convolutional]
687 | batch_normalize=1
688 | filters=256
689 | size=1
690 | stride=1
691 | pad=1
692 | activation=leaky
693 |
694 | [convolutional]
695 | batch_normalize=1
696 | size=3
697 | stride=1
698 | pad=1
699 | filters=512
700 | activation=leaky
701 |
702 | [convolutional]
703 | batch_normalize=1
704 | filters=256
705 | size=1
706 | stride=1
707 | pad=1
708 | activation=leaky
709 |
710 | [convolutional]
711 | batch_normalize=1
712 | size=3
713 | stride=1
714 | pad=1
715 | filters=512
716 | activation=leaky
717 |
718 | [convolutional]
719 | size=1
720 | stride=1
721 | pad=1
722 | filters=255
723 | activation=linear
724 |
725 |
726 | [yolo]
727 | mask = 3,4,5
728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
729 | classes=80
730 | num=9
731 | jitter=.3
732 | ignore_thresh = .7
733 | truth_thresh = 1
734 | random=1
735 |
736 |
737 |
738 | [route]
739 | layers = -4
740 |
741 | [convolutional]
742 | batch_normalize=1
743 | filters=128
744 | size=1
745 | stride=1
746 | pad=1
747 | activation=leaky
748 |
749 | [upsample]
750 | stride=2
751 |
752 | [route]
753 | layers = -1, 36
754 |
755 |
756 |
757 | [convolutional]
758 | batch_normalize=1
759 | filters=128
760 | size=1
761 | stride=1
762 | pad=1
763 | activation=leaky
764 |
765 | [convolutional]
766 | batch_normalize=1
767 | size=3
768 | stride=1
769 | pad=1
770 | filters=256
771 | activation=leaky
772 |
773 | [convolutional]
774 | batch_normalize=1
775 | filters=128
776 | size=1
777 | stride=1
778 | pad=1
779 | activation=leaky
780 |
781 | [convolutional]
782 | batch_normalize=1
783 | size=3
784 | stride=1
785 | pad=1
786 | filters=256
787 | activation=leaky
788 |
789 | [convolutional]
790 | batch_normalize=1
791 | filters=128
792 | size=1
793 | stride=1
794 | pad=1
795 | activation=leaky
796 |
797 | [convolutional]
798 | batch_normalize=1
799 | size=3
800 | stride=1
801 | pad=1
802 | filters=256
803 | activation=leaky
804 |
805 | [convolutional]
806 | size=1
807 | stride=1
808 | pad=1
809 | filters=255
810 | activation=linear
811 |
812 |
813 | [yolo]
814 | mask = 0,1,2
815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
816 | classes=80
817 | num=9
818 | jitter=.3
819 | ignore_thresh = .7
820 | truth_thresh = 1
821 | random=1
822 |
--------------------------------------------------------------------------------
/yolov3/cfg/yolov3-spp3.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=1
4 | subdivisions=1
5 | # Training
6 | # batch=64
7 | # subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 120200
21 | policy=steps
22 | steps=70000,100000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | ### SPP ###
576 | [maxpool]
577 | stride=1
578 | size=5
579 |
580 | [route]
581 | layers=-2
582 |
583 | [maxpool]
584 | stride=1
585 | size=9
586 |
587 | [route]
588 | layers=-4
589 |
590 | [maxpool]
591 | stride=1
592 | size=13
593 |
594 | [route]
595 | layers=-1,-3,-5,-6
596 |
597 | ### End SPP ###
598 |
599 | [convolutional]
600 | batch_normalize=1
601 | filters=512
602 | size=1
603 | stride=1
604 | pad=1
605 | activation=leaky
606 |
607 |
608 | [convolutional]
609 | batch_normalize=1
610 | size=3
611 | stride=1
612 | pad=1
613 | filters=1024
614 | activation=leaky
615 |
616 | [convolutional]
617 | batch_normalize=1
618 | filters=512
619 | size=1
620 | stride=1
621 | pad=1
622 | activation=leaky
623 |
624 | [convolutional]
625 | batch_normalize=1
626 | size=3
627 | stride=1
628 | pad=1
629 | filters=1024
630 | activation=leaky
631 |
632 | [convolutional]
633 | size=1
634 | stride=1
635 | pad=1
636 | filters=255
637 | activation=linear
638 |
639 |
640 | [yolo]
641 | mask = 6,7,8
642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
643 | classes=80
644 | num=9
645 | jitter=.3
646 | ignore_thresh = .7
647 | truth_thresh = 1
648 | random=1
649 |
650 |
651 | [route]
652 | layers = -4
653 |
654 | [convolutional]
655 | batch_normalize=1
656 | filters=256
657 | size=1
658 | stride=1
659 | pad=1
660 | activation=leaky
661 |
662 | [upsample]
663 | stride=2
664 |
665 | [route]
666 | layers = -1, 61
667 |
668 |
669 |
670 | [convolutional]
671 | batch_normalize=1
672 | filters=256
673 | size=1
674 | stride=1
675 | pad=1
676 | activation=leaky
677 |
678 | [convolutional]
679 | batch_normalize=1
680 | size=3
681 | stride=1
682 | pad=1
683 | filters=512
684 | activation=leaky
685 |
686 | ### SPP ###
687 | [maxpool]
688 | stride=1
689 | size=5
690 |
691 | [route]
692 | layers=-2
693 |
694 | [maxpool]
695 | stride=1
696 | size=9
697 |
698 | [route]
699 | layers=-4
700 |
701 | [maxpool]
702 | stride=1
703 | size=13
704 |
705 | [route]
706 | layers=-1,-3,-5,-6
707 |
708 | ### End SPP ###
709 |
710 |
711 | [convolutional]
712 | batch_normalize=1
713 | filters=256
714 | size=1
715 | stride=1
716 | pad=1
717 | activation=leaky
718 |
719 | [convolutional]
720 | batch_normalize=1
721 | size=3
722 | stride=1
723 | pad=1
724 | filters=512
725 | activation=leaky
726 |
727 | [convolutional]
728 | batch_normalize=1
729 | filters=256
730 | size=1
731 | stride=1
732 | pad=1
733 | activation=leaky
734 |
735 | [convolutional]
736 | batch_normalize=1
737 | size=3
738 | stride=1
739 | pad=1
740 | filters=512
741 | activation=leaky
742 |
743 | [convolutional]
744 | size=1
745 | stride=1
746 | pad=1
747 | filters=255
748 | activation=linear
749 |
750 |
751 | [yolo]
752 | mask = 3,4,5
753 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
754 | classes=80
755 | num=9
756 | jitter=.3
757 | ignore_thresh = .7
758 | truth_thresh = 1
759 | random=1
760 |
761 |
762 |
763 | [route]
764 | layers = -4
765 |
766 | [convolutional]
767 | batch_normalize=1
768 | filters=128
769 | size=1
770 | stride=1
771 | pad=1
772 | activation=leaky
773 |
774 | [upsample]
775 | stride=2
776 |
777 | [route]
778 | layers = -1, 36
779 |
780 |
781 |
782 | [convolutional]
783 | batch_normalize=1
784 | filters=128
785 | size=1
786 | stride=1
787 | pad=1
788 | activation=leaky
789 |
790 | [convolutional]
791 | batch_normalize=1
792 | size=3
793 | stride=1
794 | pad=1
795 | filters=256
796 | activation=leaky
797 |
798 | [convolutional]
799 | batch_normalize=1
800 | filters=128
801 | size=1
802 | stride=1
803 | pad=1
804 | activation=leaky
805 |
806 | ### SPP ###
807 | [maxpool]
808 | stride=1
809 | size=5
810 |
811 | [route]
812 | layers=-2
813 |
814 | [maxpool]
815 | stride=1
816 | size=9
817 |
818 | [route]
819 | layers=-4
820 |
821 | [maxpool]
822 | stride=1
823 | size=13
824 |
825 | [route]
826 | layers=-1,-3,-5,-6
827 |
828 | ### End SPP ###
829 |
830 | [convolutional]
831 | batch_normalize=1
832 | size=3
833 | stride=1
834 | pad=1
835 | filters=256
836 | activation=leaky
837 |
838 | [convolutional]
839 | batch_normalize=1
840 | filters=128
841 | size=1
842 | stride=1
843 | pad=1
844 | activation=leaky
845 |
846 | [convolutional]
847 | batch_normalize=1
848 | size=3
849 | stride=1
850 | pad=1
851 | filters=256
852 | activation=leaky
853 |
854 | [convolutional]
855 | size=1
856 | stride=1
857 | pad=1
858 | filters=255
859 | activation=linear
860 |
861 |
862 | [yolo]
863 | mask = 0,1,2
864 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
865 | classes=80
866 | num=9
867 | jitter=.3
868 | ignore_thresh = .7
869 | truth_thresh = 1
870 | random=1
871 |
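Editorial note (not part of the cfg file above): every detection head in the cfg files in this directory follows the same invariant — the `filters` of the `[convolutional]` block immediately preceding a `[yolo]` block equals `(classes + 5) * len(mask)`, e.g. `255 = (80 + 5) * 3` in yolov3-spp3.cfg. The sketch below is a minimal, illustrative checker of that relationship; `parse_cfg` and `check_yolo_filters` are hypothetical helper names, not functions from this repository.

```python
# Illustrative sketch, not repository code: verify that the conv feeding each
# [yolo] head has filters == (classes + 5) * len(mask).
def parse_cfg(path):
    """Parse a Darknet-style cfg into a list of {'type': ..., key: value} dicts."""
    blocks = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue                                # skip blanks and comments
            if line.startswith('['):
                blocks.append({'type': line.strip('[]')})
            else:
                key, value = line.split('=', 1)
                blocks[-1][key.strip()] = value.strip()
    return blocks

def check_yolo_filters(path):
    blocks = parse_cfg(path)
    for i, block in enumerate(blocks):
        if block['type'] != 'yolo':
            continue
        expected = (int(block['classes']) + 5) * len(block['mask'].split(','))
        actual = int(blocks[i - 1]['filters'])          # conv right before the head
        assert actual == expected, (path, i, actual, expected)

check_yolo_filters('yolov3/cfg/yolov3-spp3.cfg')        # 255 == (80 + 5) * 3
```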
--------------------------------------------------------------------------------
/yolov3/cfg/yolov3-tiny-1cls.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=1
4 | subdivisions=1
5 | # Training
6 | # batch=64
7 | # subdivisions=2
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=16
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | [maxpool]
34 | size=2
35 | stride=2
36 |
37 | [convolutional]
38 | batch_normalize=1
39 | filters=32
40 | size=3
41 | stride=1
42 | pad=1
43 | activation=leaky
44 |
45 | [maxpool]
46 | size=2
47 | stride=2
48 |
49 | [convolutional]
50 | batch_normalize=1
51 | filters=64
52 | size=3
53 | stride=1
54 | pad=1
55 | activation=leaky
56 |
57 | [maxpool]
58 | size=2
59 | stride=2
60 |
61 | [convolutional]
62 | batch_normalize=1
63 | filters=128
64 | size=3
65 | stride=1
66 | pad=1
67 | activation=leaky
68 |
69 | [maxpool]
70 | size=2
71 | stride=2
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=256
76 | size=3
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [maxpool]
82 | size=2
83 | stride=2
84 |
85 | [convolutional]
86 | batch_normalize=1
87 | filters=512
88 | size=3
89 | stride=1
90 | pad=1
91 | activation=leaky
92 |
93 | [maxpool]
94 | size=2
95 | stride=1
96 |
97 | [convolutional]
98 | batch_normalize=1
99 | filters=1024
100 | size=3
101 | stride=1
102 | pad=1
103 | activation=leaky
104 |
105 | ###########
106 |
107 | [convolutional]
108 | batch_normalize=1
109 | filters=256
110 | size=1
111 | stride=1
112 | pad=1
113 | activation=leaky
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=512
118 | size=3
119 | stride=1
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | size=1
125 | stride=1
126 | pad=1
127 | filters=18
128 | activation=linear
129 |
130 |
131 |
132 | [yolo]
133 | mask = 3,4,5
134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
135 | classes=1
136 | num=6
137 | jitter=.3
138 | ignore_thresh = .7
139 | truth_thresh = 1
140 | random=1
141 |
142 | [route]
143 | layers = -4
144 |
145 | [convolutional]
146 | batch_normalize=1
147 | filters=128
148 | size=1
149 | stride=1
150 | pad=1
151 | activation=leaky
152 |
153 | [upsample]
154 | stride=2
155 |
156 | [route]
157 | layers = -1, 8
158 |
159 | [convolutional]
160 | batch_normalize=1
161 | filters=256
162 | size=3
163 | stride=1
164 | pad=1
165 | activation=leaky
166 |
167 | [convolutional]
168 | size=1
169 | stride=1
170 | pad=1
171 | filters=18
172 | activation=linear
173 |
174 | [yolo]
175 | mask = 0,1,2
176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
177 | classes=1
178 | num=6
179 | jitter=.3
180 | ignore_thresh = .7
181 | truth_thresh = 1
182 | random=1
183 |
--------------------------------------------------------------------------------
/yolov3/cfg/yolov3-tiny-3cls.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=1
4 | subdivisions=1
5 | # Training
6 | # batch=64
7 | # subdivisions=2
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=16
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | [maxpool]
34 | size=2
35 | stride=2
36 |
37 | [convolutional]
38 | batch_normalize=1
39 | filters=32
40 | size=3
41 | stride=1
42 | pad=1
43 | activation=leaky
44 |
45 | [maxpool]
46 | size=2
47 | stride=2
48 |
49 | [convolutional]
50 | batch_normalize=1
51 | filters=64
52 | size=3
53 | stride=1
54 | pad=1
55 | activation=leaky
56 |
57 | [maxpool]
58 | size=2
59 | stride=2
60 |
61 | [convolutional]
62 | batch_normalize=1
63 | filters=128
64 | size=3
65 | stride=1
66 | pad=1
67 | activation=leaky
68 |
69 | [maxpool]
70 | size=2
71 | stride=2
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=256
76 | size=3
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [maxpool]
82 | size=2
83 | stride=2
84 |
85 | [convolutional]
86 | batch_normalize=1
87 | filters=512
88 | size=3
89 | stride=1
90 | pad=1
91 | activation=leaky
92 |
93 | [maxpool]
94 | size=2
95 | stride=1
96 |
97 | [convolutional]
98 | batch_normalize=1
99 | filters=1024
100 | size=3
101 | stride=1
102 | pad=1
103 | activation=leaky
104 |
105 | ###########
106 |
107 | [convolutional]
108 | batch_normalize=1
109 | filters=256
110 | size=1
111 | stride=1
112 | pad=1
113 | activation=leaky
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=512
118 | size=3
119 | stride=1
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | size=1
125 | stride=1
126 | pad=1
127 | filters=24
128 | activation=linear
129 |
130 |
131 |
132 | [yolo]
133 | mask = 3,4,5
134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
135 | classes=3
136 | num=6
137 | jitter=.3
138 | ignore_thresh = .7
139 | truth_thresh = 1
140 | random=1
141 |
142 | [route]
143 | layers = -4
144 |
145 | [convolutional]
146 | batch_normalize=1
147 | filters=128
148 | size=1
149 | stride=1
150 | pad=1
151 | activation=leaky
152 |
153 | [upsample]
154 | stride=2
155 |
156 | [route]
157 | layers = -1, 8
158 |
159 | [convolutional]
160 | batch_normalize=1
161 | filters=256
162 | size=3
163 | stride=1
164 | pad=1
165 | activation=leaky
166 |
167 | [convolutional]
168 | size=1
169 | stride=1
170 | pad=1
171 | filters=24
172 | activation=linear
173 |
174 | [yolo]
175 | mask = 0,1,2
176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
177 | classes=3
178 | num=6
179 | jitter=.3
180 | ignore_thresh = .7
181 | truth_thresh = 1
182 | random=1
183 |
--------------------------------------------------------------------------------
/yolov3/cfg/yolov3-tiny.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=1
4 | subdivisions=1
5 | # Training
6 | # batch=64
7 | # subdivisions=2
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=16
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | [maxpool]
34 | size=2
35 | stride=2
36 |
37 | [convolutional]
38 | batch_normalize=1
39 | filters=32
40 | size=3
41 | stride=1
42 | pad=1
43 | activation=leaky
44 |
45 | [maxpool]
46 | size=2
47 | stride=2
48 |
49 | [convolutional]
50 | batch_normalize=1
51 | filters=64
52 | size=3
53 | stride=1
54 | pad=1
55 | activation=leaky
56 |
57 | [maxpool]
58 | size=2
59 | stride=2
60 |
61 | [convolutional]
62 | batch_normalize=1
63 | filters=128
64 | size=3
65 | stride=1
66 | pad=1
67 | activation=leaky
68 |
69 | [maxpool]
70 | size=2
71 | stride=2
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=256
76 | size=3
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [maxpool]
82 | size=2
83 | stride=2
84 |
85 | [convolutional]
86 | batch_normalize=1
87 | filters=512
88 | size=3
89 | stride=1
90 | pad=1
91 | activation=leaky
92 |
93 | [maxpool]
94 | size=2
95 | stride=1
96 |
97 | [convolutional]
98 | batch_normalize=1
99 | filters=1024
100 | size=3
101 | stride=1
102 | pad=1
103 | activation=leaky
104 |
105 | ###########
106 |
107 | [convolutional]
108 | batch_normalize=1
109 | filters=256
110 | size=1
111 | stride=1
112 | pad=1
113 | activation=leaky
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=512
118 | size=3
119 | stride=1
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | size=1
125 | stride=1
126 | pad=1
127 | filters=255
128 | activation=linear
129 |
130 |
131 |
132 | [yolo]
133 | mask = 3,4,5
134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
135 | classes=80
136 | num=6
137 | jitter=.3
138 | ignore_thresh = .7
139 | truth_thresh = 1
140 | random=1
141 |
142 | [route]
143 | layers = -4
144 |
145 | [convolutional]
146 | batch_normalize=1
147 | filters=128
148 | size=1
149 | stride=1
150 | pad=1
151 | activation=leaky
152 |
153 | [upsample]
154 | stride=2
155 |
156 | [route]
157 | layers = -1, 8
158 |
159 | [convolutional]
160 | batch_normalize=1
161 | filters=256
162 | size=3
163 | stride=1
164 | pad=1
165 | activation=leaky
166 |
167 | [convolutional]
168 | size=1
169 | stride=1
170 | pad=1
171 | filters=255
172 | activation=linear
173 |
174 | [yolo]
175 | mask = 1,2,3
176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
177 | classes=80
178 | num=6
179 | jitter=.3
180 | ignore_thresh = .7
181 | truth_thresh = 1
182 | random=1
183 |
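Editorial note (not part of the cfg files above): the three tiny variants listed here share the same backbone; what changes is the class count wired into each head — `classes=80` with `filters=255` in yolov3-tiny.cfg, `classes=1` with `filters=18` in yolov3-tiny-1cls.cfg, and `classes=3` with `filters=24` in yolov3-tiny-3cls.cfg, all per the same `(classes + 5) * masks` rule noted earlier (the 80-class file also keeps upstream's `mask = 1,2,3` on its second head, where the 1-/3-class files use `0,1,2`). A minimal sketch of that arithmetic; `head_filters` is an illustrative name, not a repository function.

```python
# Illustrative sketch, not repository code: output channels of the 1x1 conv
# feeding a [yolo] head -- one box (x, y, w, h, objectness) plus one score per
# class, for each anchor in the head's mask.
def head_filters(classes, masks_per_head=3):
    return (classes + 5) * masks_per_head

assert head_filters(80) == 255  # yolov3-tiny.cfg
assert head_filters(1) == 18    # yolov3-tiny-1cls.cfg
assert head_filters(3) == 24    # yolov3-tiny-3cls.cfg
```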
--------------------------------------------------------------------------------
/yolov3/cfg/yolov3-tiny3-1cls.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | # batch=1
4 | # subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 200000
21 | policy=steps
22 | steps=180000,190000
23 | scales=.1,.1
24 |
25 |
26 | [convolutional]
27 | batch_normalize=1
28 | filters=16
29 | size=3
30 | stride=1
31 | pad=1
32 | activation=leaky
33 |
34 | [maxpool]
35 | size=2
36 | stride=2
37 |
38 | [convolutional]
39 | batch_normalize=1
40 | filters=32
41 | size=3
42 | stride=1
43 | pad=1
44 | activation=leaky
45 |
46 | [maxpool]
47 | size=2
48 | stride=2
49 |
50 | [convolutional]
51 | batch_normalize=1
52 | filters=64
53 | size=3
54 | stride=1
55 | pad=1
56 | activation=leaky
57 |
58 | [maxpool]
59 | size=2
60 | stride=2
61 |
62 | [convolutional]
63 | batch_normalize=1
64 | filters=128
65 | size=3
66 | stride=1
67 | pad=1
68 | activation=leaky
69 |
70 | [maxpool]
71 | size=2
72 | stride=2
73 |
74 | [convolutional]
75 | batch_normalize=1
76 | filters=256
77 | size=3
78 | stride=1
79 | pad=1
80 | activation=leaky
81 |
82 | [maxpool]
83 | size=2
84 | stride=2
85 |
86 | [convolutional]
87 | batch_normalize=1
88 | filters=512
89 | size=3
90 | stride=1
91 | pad=1
92 | activation=leaky
93 |
94 | [maxpool]
95 | size=2
96 | stride=1
97 |
98 | [convolutional]
99 | batch_normalize=1
100 | filters=1024
101 | size=3
102 | stride=1
103 | pad=1
104 | activation=leaky
105 |
106 | ###########
107 |
108 | [convolutional]
109 | batch_normalize=1
110 | filters=256
111 | size=1
112 | stride=1
113 | pad=1
114 | activation=leaky
115 |
116 | [convolutional]
117 | batch_normalize=1
118 | filters=512
119 | size=3
120 | stride=1
121 | pad=1
122 | activation=leaky
123 |
124 | [convolutional]
125 | size=1
126 | stride=1
127 | pad=1
128 | filters=18
129 | activation=linear
130 |
131 |
132 |
133 | [yolo]
134 | mask = 6,7,8
135 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
136 | classes=1
137 | num=9
138 | jitter=.3
139 | ignore_thresh = .7
140 | truth_thresh = 1
141 | random=1
142 |
143 | [route]
144 | layers = -4
145 |
146 | [convolutional]
147 | batch_normalize=1
148 | filters=128
149 | size=1
150 | stride=1
151 | pad=1
152 | activation=leaky
153 |
154 | [upsample]
155 | stride=2
156 |
157 | [route]
158 | layers = -1, 8
159 |
160 | [convolutional]
161 | batch_normalize=1
162 | filters=256
163 | size=3
164 | stride=1
165 | pad=1
166 | activation=leaky
167 |
168 | [convolutional]
169 | size=1
170 | stride=1
171 | pad=1
172 | filters=18
173 | activation=linear
174 |
175 | [yolo]
176 | mask = 3,4,5
177 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
178 | classes=1
179 | num=9
180 | jitter=.3
181 | ignore_thresh = .7
182 | truth_thresh = 1
183 | random=1
184 |
185 |
186 |
187 | [route]
188 | layers = -3
189 |
190 | [convolutional]
191 | batch_normalize=1
192 | filters=128
193 | size=1
194 | stride=1
195 | pad=1
196 | activation=leaky
197 |
198 | [upsample]
199 | stride=2
200 |
201 | [route]
202 | layers = -1, 6
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=3
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | size=1
214 | stride=1
215 | pad=1
216 | filters=18
217 | activation=linear
218 |
219 | [yolo]
220 | mask = 0,1,2
221 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
222 | classes=1
223 | num=9
224 | jitter=.3
225 | ignore_thresh = .7
226 | truth_thresh = 1
227 | random=1
228 |
--------------------------------------------------------------------------------
/yolov3/cfg/yolov3-tiny3.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | # batch=1
4 | # subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 200000
21 | policy=steps
22 | steps=180000,190000
23 | scales=.1,.1
24 |
25 |
26 | [convolutional]
27 | batch_normalize=1
28 | filters=16
29 | size=3
30 | stride=1
31 | pad=1
32 | activation=leaky
33 |
34 | [maxpool]
35 | size=2
36 | stride=2
37 |
38 | [convolutional]
39 | batch_normalize=1
40 | filters=32
41 | size=3
42 | stride=1
43 | pad=1
44 | activation=leaky
45 |
46 | [maxpool]
47 | size=2
48 | stride=2
49 |
50 | [convolutional]
51 | batch_normalize=1
52 | filters=64
53 | size=3
54 | stride=1
55 | pad=1
56 | activation=leaky
57 |
58 | [maxpool]
59 | size=2
60 | stride=2
61 |
62 | [convolutional]
63 | batch_normalize=1
64 | filters=128
65 | size=3
66 | stride=1
67 | pad=1
68 | activation=leaky
69 |
70 | [maxpool]
71 | size=2
72 | stride=2
73 |
74 | [convolutional]
75 | batch_normalize=1
76 | filters=256
77 | size=3
78 | stride=1
79 | pad=1
80 | activation=leaky
81 |
82 | [maxpool]
83 | size=2
84 | stride=2
85 |
86 | [convolutional]
87 | batch_normalize=1
88 | filters=512
89 | size=3
90 | stride=1
91 | pad=1
92 | activation=leaky
93 |
94 | [maxpool]
95 | size=2
96 | stride=1
97 |
98 | [convolutional]
99 | batch_normalize=1
100 | filters=1024
101 | size=3
102 | stride=1
103 | pad=1
104 | activation=leaky
105 |
106 | ###########
107 |
108 | [convolutional]
109 | batch_normalize=1
110 | filters=256
111 | size=1
112 | stride=1
113 | pad=1
114 | activation=leaky
115 |
116 | [convolutional]
117 | batch_normalize=1
118 | filters=512
119 | size=3
120 | stride=1
121 | pad=1
122 | activation=leaky
123 |
124 | [convolutional]
125 | size=1
126 | stride=1
127 | pad=1
128 | filters=255
129 | activation=linear
130 |
131 |
132 |
133 | [yolo]
134 | mask = 6,7,8
135 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
136 | classes=80
137 | num=9
138 | jitter=.3
139 | ignore_thresh = .7
140 | truth_thresh = 1
141 | random=1
142 |
143 | [route]
144 | layers = -4
145 |
146 | [convolutional]
147 | batch_normalize=1
148 | filters=128
149 | size=1
150 | stride=1
151 | pad=1
152 | activation=leaky
153 |
154 | [upsample]
155 | stride=2
156 |
157 | [route]
158 | layers = -1, 8
159 |
160 | [convolutional]
161 | batch_normalize=1
162 | filters=256
163 | size=3
164 | stride=1
165 | pad=1
166 | activation=leaky
167 |
168 | [convolutional]
169 | size=1
170 | stride=1
171 | pad=1
172 | filters=255
173 | activation=linear
174 |
175 | [yolo]
176 | mask = 3,4,5
177 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
178 | classes=80
179 | num=9
180 | jitter=.3
181 | ignore_thresh = .7
182 | truth_thresh = 1
183 | random=1
184 |
185 |
186 |
187 | [route]
188 | layers = -3
189 |
190 | [convolutional]
191 | batch_normalize=1
192 | filters=128
193 | size=1
194 | stride=1
195 | pad=1
196 | activation=leaky
197 |
198 | [upsample]
199 | stride=2
200 |
201 | [route]
202 | layers = -1, 6
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=3
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | size=1
214 | stride=1
215 | pad=1
216 | filters=255
217 | activation=linear
218 |
219 | [yolo]
220 | mask = 0,1,2
221 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
222 | classes=80
223 | num=9
224 | jitter=.3
225 | ignore_thresh = .7
226 | truth_thresh = 1
227 | random=1
228 |
--------------------------------------------------------------------------------
/yolov3/cfg/yolov3.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | #batch=1
4 | #subdivisions=1
5 | # Training
6 | batch=16
7 | subdivisions=1
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | [convolutional]
576 | batch_normalize=1
577 | size=3
578 | stride=1
579 | pad=1
580 | filters=1024
581 | activation=leaky
582 |
583 | [convolutional]
584 | batch_normalize=1
585 | filters=512
586 | size=1
587 | stride=1
588 | pad=1
589 | activation=leaky
590 |
591 | [convolutional]
592 | batch_normalize=1
593 | size=3
594 | stride=1
595 | pad=1
596 | filters=1024
597 | activation=leaky
598 |
599 | [convolutional]
600 | size=1
601 | stride=1
602 | pad=1
603 | filters=255
604 | activation=linear
605 |
606 |
607 | [yolo]
608 | mask = 6,7,8
609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
610 | classes=80
611 | num=9
612 | jitter=.3
613 | ignore_thresh = .7
614 | truth_thresh = 1
615 | random=1
616 |
617 |
618 | [route]
619 | layers = -4
620 |
621 | [convolutional]
622 | batch_normalize=1
623 | filters=256
624 | size=1
625 | stride=1
626 | pad=1
627 | activation=leaky
628 |
629 | [upsample]
630 | stride=2
631 |
632 | [route]
633 | layers = -1, 61
634 |
635 |
636 |
637 | [convolutional]
638 | batch_normalize=1
639 | filters=256
640 | size=1
641 | stride=1
642 | pad=1
643 | activation=leaky
644 |
645 | [convolutional]
646 | batch_normalize=1
647 | size=3
648 | stride=1
649 | pad=1
650 | filters=512
651 | activation=leaky
652 |
653 | [convolutional]
654 | batch_normalize=1
655 | filters=256
656 | size=1
657 | stride=1
658 | pad=1
659 | activation=leaky
660 |
661 | [convolutional]
662 | batch_normalize=1
663 | size=3
664 | stride=1
665 | pad=1
666 | filters=512
667 | activation=leaky
668 |
669 | [convolutional]
670 | batch_normalize=1
671 | filters=256
672 | size=1
673 | stride=1
674 | pad=1
675 | activation=leaky
676 |
677 | [convolutional]
678 | batch_normalize=1
679 | size=3
680 | stride=1
681 | pad=1
682 | filters=512
683 | activation=leaky
684 |
685 | [convolutional]
686 | size=1
687 | stride=1
688 | pad=1
689 | filters=255
690 | activation=linear
691 |
692 |
693 | [yolo]
694 | mask = 3,4,5
695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
696 | classes=80
697 | num=9
698 | jitter=.3
699 | ignore_thresh = .7
700 | truth_thresh = 1
701 | random=1
702 |
703 |
704 |
705 | [route]
706 | layers = -4
707 |
708 | [convolutional]
709 | batch_normalize=1
710 | filters=128
711 | size=1
712 | stride=1
713 | pad=1
714 | activation=leaky
715 |
716 | [upsample]
717 | stride=2
718 |
719 | [route]
720 | layers = -1, 36
721 |
722 |
723 |
724 | [convolutional]
725 | batch_normalize=1
726 | filters=128
727 | size=1
728 | stride=1
729 | pad=1
730 | activation=leaky
731 |
732 | [convolutional]
733 | batch_normalize=1
734 | size=3
735 | stride=1
736 | pad=1
737 | filters=256
738 | activation=leaky
739 |
740 | [convolutional]
741 | batch_normalize=1
742 | filters=128
743 | size=1
744 | stride=1
745 | pad=1
746 | activation=leaky
747 |
748 | [convolutional]
749 | batch_normalize=1
750 | size=3
751 | stride=1
752 | pad=1
753 | filters=256
754 | activation=leaky
755 |
756 | [convolutional]
757 | batch_normalize=1
758 | filters=128
759 | size=1
760 | stride=1
761 | pad=1
762 | activation=leaky
763 |
764 | [convolutional]
765 | batch_normalize=1
766 | size=3
767 | stride=1
768 | pad=1
769 | filters=256
770 | activation=leaky
771 |
772 | [convolutional]
773 | size=1
774 | stride=1
775 | pad=1
776 | filters=255
777 | activation=linear
778 |
779 |
780 | [yolo]
781 | mask = 0,1,2
782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783 | classes=80
784 | num=9
785 | jitter=.3
786 | ignore_thresh = .7
787 | truth_thresh = 1
788 | random=1
789 |
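Editorial note (not part of yolov3.cfg above): the `[route]` and `[shortcut]` blocks refer to other layers by index — every block after `[net]` is numbered from 0, and negative values are relative to the current block, so `layers = -4` points four blocks back while `layers = -1, 61` concatenates the previous block's output with that of block 61. A minimal sketch of resolving those references, reusing the `parse_cfg` helper from the sketch after yolov3-spp3.cfg; `resolve_references` is an illustrative name, not a repository function.

```python
# Illustrative sketch, not repository code: turn the relative indices used by
# [route]/[shortcut] blocks into absolute 0-based layer positions.
def resolve_references(blocks):
    layers = [b for b in blocks if b['type'] != 'net']   # [net] is not a layer
    resolved = {}
    for i, block in enumerate(layers):
        if block['type'] == 'route':
            refs = [int(x) for x in block['layers'].split(',')]
        elif block['type'] == 'shortcut':
            refs = [int(block['from'])]
        else:
            continue
        resolved[i] = [r if r >= 0 else i + r for r in refs]
    return resolved

# e.g. resolve_references(parse_cfg('yolov3/cfg/yolov3.cfg'))
```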
--------------------------------------------------------------------------------
/yolov3/cfg/yolov3s.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | # batch=1
4 | # subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=swish
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=swish
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=swish
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=swish
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=swish
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=swish
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=swish
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=swish
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=swish
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=swish
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=swish
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=swish
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=swish
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=swish
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=swish
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=swish
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=swish
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=swish
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=swish
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=swish
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=swish
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=swish
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=swish
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=swish
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=swish
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=swish
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=swish
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=swish
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=swish
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=swish
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=swish
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=swish
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=swish
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=swish
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=swish
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=swish
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=swish
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=swish
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=swish
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=swish
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=swish
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=swish
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=swish
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=swish
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=swish
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=swish
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=swish
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=swish
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=swish
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=swish
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=swish
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=swish
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=swish
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=swish
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=swish
574 |
575 | ### SPP ###
576 | [maxpool]
577 | stride=1
578 | size=5
579 |
580 | [route]
581 | layers=-2
582 |
583 | [maxpool]
584 | stride=1
585 | size=9
586 |
587 | [route]
588 | layers=-4
589 |
590 | [maxpool]
591 | stride=1
592 | size=13
593 |
594 | [route]
595 | layers=-1,-3,-5,-6
596 |
597 | ### End SPP ###
598 |
599 | [convolutional]
600 | batch_normalize=1
601 | filters=512
602 | size=1
603 | stride=1
604 | pad=1
605 | activation=swish
606 |
607 |
608 | [convolutional]
609 | batch_normalize=1
610 | size=3
611 | stride=1
612 | pad=1
613 | filters=1024
614 | activation=swish
615 |
616 | [convolutional]
617 | batch_normalize=1
618 | filters=512
619 | size=1
620 | stride=1
621 | pad=1
622 | activation=swish
623 |
624 | [convolutional]
625 | batch_normalize=1
626 | size=3
627 | stride=1
628 | pad=1
629 | filters=1024
630 | activation=swish
631 |
632 | [convolutional]
633 | size=1
634 | stride=1
635 | pad=1
636 | filters=255
637 | activation=linear
638 |
639 |
640 | [yolo]
641 | mask = 6,7,8
642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
643 | classes=80
644 | num=9
645 | jitter=.3
646 | ignore_thresh = .7
647 | truth_thresh = 1
648 | random=1
649 |
650 |
651 | [route]
652 | layers = -4
653 |
654 | [convolutional]
655 | batch_normalize=1
656 | filters=256
657 | size=1
658 | stride=1
659 | pad=1
660 | activation=swish
661 |
662 | [upsample]
663 | stride=2
664 |
665 | [route]
666 | layers = -1, 61
667 |
668 |
669 |
670 | [convolutional]
671 | batch_normalize=1
672 | filters=256
673 | size=1
674 | stride=1
675 | pad=1
676 | activation=swish
677 |
678 | [convolutional]
679 | batch_normalize=1
680 | size=3
681 | stride=1
682 | pad=1
683 | filters=512
684 | activation=swish
685 |
686 | [convolutional]
687 | batch_normalize=1
688 | filters=256
689 | size=1
690 | stride=1
691 | pad=1
692 | activation=swish
693 |
694 | [convolutional]
695 | batch_normalize=1
696 | size=3
697 | stride=1
698 | pad=1
699 | filters=512
700 | activation=swish
701 |
702 | [convolutional]
703 | batch_normalize=1
704 | filters=256
705 | size=1
706 | stride=1
707 | pad=1
708 | activation=swish
709 |
710 | [convolutional]
711 | batch_normalize=1
712 | size=3
713 | stride=1
714 | pad=1
715 | filters=512
716 | activation=swish
717 |
718 | [convolutional]
719 | size=1
720 | stride=1
721 | pad=1
722 | filters=255
723 | activation=linear
724 |
725 |
726 | [yolo]
727 | mask = 3,4,5
728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
729 | classes=80
730 | num=9
731 | jitter=.3
732 | ignore_thresh = .7
733 | truth_thresh = 1
734 | random=1
735 |
736 |
737 |
738 | [route]
739 | layers = -4
740 |
741 | [convolutional]
742 | batch_normalize=1
743 | filters=128
744 | size=1
745 | stride=1
746 | pad=1
747 | activation=swish
748 |
749 | [upsample]
750 | stride=2
751 |
752 | [route]
753 | layers = -1, 36
754 |
755 |
756 |
757 | [convolutional]
758 | batch_normalize=1
759 | filters=128
760 | size=1
761 | stride=1
762 | pad=1
763 | activation=swish
764 |
765 | [convolutional]
766 | batch_normalize=1
767 | size=3
768 | stride=1
769 | pad=1
770 | filters=256
771 | activation=swish
772 |
773 | [convolutional]
774 | batch_normalize=1
775 | filters=128
776 | size=1
777 | stride=1
778 | pad=1
779 | activation=swish
780 |
781 | [convolutional]
782 | batch_normalize=1
783 | size=3
784 | stride=1
785 | pad=1
786 | filters=256
787 | activation=swish
788 |
789 | [convolutional]
790 | batch_normalize=1
791 | filters=128
792 | size=1
793 | stride=1
794 | pad=1
795 | activation=swish
796 |
797 | [convolutional]
798 | batch_normalize=1
799 | size=3
800 | stride=1
801 | pad=1
802 | filters=256
803 | activation=swish
804 |
805 | [convolutional]
806 | size=1
807 | stride=1
808 | pad=1
809 | filters=255
810 | activation=linear
811 |
812 |
813 | [yolo]
814 | mask = 0,1,2
815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
816 | classes=80
817 | num=9
818 | jitter=.3
819 | ignore_thresh = .7
820 | truth_thresh = 1
821 | random=1
822 |
--------------------------------------------------------------------------------
/yolov3/cfg/yolov4-tiny-1cls.cfg:
--------------------------------------------------------------------------------
1 | # Generated by Glenn Jocher (glenn.jocher@ultralytics.com) for https://github.com/ultralytics/yolov3
2 | # def kmean_anchors(path='../coco/train2017.txt', n=12, img_size=(320, 640)): # from utils.utils import *; kmean_anchors()
3 | # Evolving anchors: 100%|██████████| 1000/1000 [41:15<00:00, 2.48s/it]
4 | # 0.20 iou_thr: 0.992 best possible recall, 4.25 anchors > thr
5 | # kmeans anchors (n=12, img_size=(320, 640), IoU=0.005/0.184/0.634-min/mean/best): 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359
6 |
7 | [net]
8 | # Testing
9 | # batch=1
10 | # subdivisions=1
11 | # Training
12 | batch=64
13 | subdivisions=16
14 | width=608
15 | height=608
16 | channels=3
17 | momentum=0.9
18 | decay=0.0005
19 | angle=0
20 | saturation = 1.5
21 | exposure = 1.5
22 | hue=.1
23 |
24 | learning_rate=0.001
25 | burn_in=1000
26 | max_batches = 200000
27 | policy=steps
28 | steps=180000,190000
29 | scales=.1,.1
30 |
31 |
32 | [convolutional]
33 | batch_normalize=1
34 | filters=16
35 | size=3
36 | stride=1
37 | pad=1
38 | activation=leaky
39 |
40 | [maxpool]
41 | size=2
42 | stride=2
43 |
44 | [convolutional]
45 | batch_normalize=1
46 | filters=32
47 | size=3
48 | stride=1
49 | pad=1
50 | activation=leaky
51 |
52 | [maxpool]
53 | size=2
54 | stride=2
55 |
56 | [convolutional]
57 | batch_normalize=1
58 | filters=64
59 | size=3
60 | stride=1
61 | pad=1
62 | activation=leaky
63 |
64 | [maxpool]
65 | size=2
66 | stride=2
67 |
68 | [convolutional]
69 | batch_normalize=1
70 | filters=128
71 | size=3
72 | stride=1
73 | pad=1
74 | activation=leaky
75 |
76 | [maxpool]
77 | size=2
78 | stride=2
79 |
80 | [convolutional]
81 | batch_normalize=1
82 | filters=256
83 | size=3
84 | stride=1
85 | pad=1
86 | activation=leaky
87 |
88 | [maxpool]
89 | size=2
90 | stride=2
91 |
92 | [convolutional]
93 | batch_normalize=1
94 | filters=512
95 | size=3
96 | stride=1
97 | pad=1
98 | activation=leaky
99 |
100 | [maxpool]
101 | size=2
102 | stride=1
103 |
104 | [convolutional]
105 | batch_normalize=1
106 | filters=1024
107 | size=3
108 | stride=1
109 | pad=1
110 | activation=leaky
111 |
112 | ###########
113 |
114 | [convolutional]
115 | batch_normalize=1
116 | filters=256
117 | size=1
118 | stride=1
119 | pad=1
120 | activation=leaky
121 |
122 | [convolutional]
123 | batch_normalize=1
124 | filters=512
125 | size=3
126 | stride=1
127 | pad=1
128 | activation=leaky
129 |
130 | [convolutional]
131 | size=1
132 | stride=1
133 | pad=1
134 | filters=24
135 | activation=linear
136 |
137 |
138 |
139 | [yolo]
140 | mask = 8,9,10,11
141 | anchors = 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359
142 | classes=1
143 | num=12
144 | jitter=.3
145 | ignore_thresh = .7
146 | truth_thresh = 1
147 | random=1
148 |
149 | [route]
150 | layers = -4
151 |
152 | [convolutional]
153 | batch_normalize=1
154 | filters=128
155 | size=1
156 | stride=1
157 | pad=1
158 | activation=leaky
159 |
160 | [upsample]
161 | stride=2
162 |
163 | [route]
164 | layers = -1, 8
165 |
166 | [convolutional]
167 | batch_normalize=1
168 | filters=256
169 | size=3
170 | stride=1
171 | pad=1
172 | activation=leaky
173 |
174 | [convolutional]
175 | size=1
176 | stride=1
177 | pad=1
178 | filters=24
179 | activation=linear
180 |
181 | [yolo]
182 | mask = 4,5,6,7
183 | anchors = 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359
184 | classes=1
185 | num=12
186 | jitter=.3
187 | ignore_thresh = .7
188 | truth_thresh = 1
189 | random=1
190 |
191 |
192 |
193 | [route]
194 | layers = -3
195 |
196 | [convolutional]
197 | batch_normalize=1
198 | filters=128
199 | size=1
200 | stride=1
201 | pad=1
202 | activation=leaky
203 |
204 | [upsample]
205 | stride=2
206 |
207 | [route]
208 | layers = -1, 6
209 |
210 | [convolutional]
211 | batch_normalize=1
212 | filters=128
213 | size=3
214 | stride=1
215 | pad=1
216 | activation=leaky
217 |
218 | [convolutional]
219 | size=1
220 | stride=1
221 | pad=1
222 | filters=24
223 | activation=linear
224 |
225 | [yolo]
226 | mask = 0,1,2,3
227 | anchors = 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359
228 | classes=1
229 | num=12
230 | jitter=.3
231 | ignore_thresh = .7
232 | truth_thresh = 1
233 | random=1
234 |
--------------------------------------------------------------------------------
/yolov3/cfg/yolov4-tiny.cfg:
--------------------------------------------------------------------------------
1 | # Generated by Glenn Jocher (glenn.jocher@ultralytics.com) for https://github.com/ultralytics/yolov3
2 | # def kmean_anchors(path='../coco/train2017.txt', n=12, img_size=(320, 640)): # from utils.utils import *; kmean_anchors()
3 | # Evolving anchors: 100%|██████████| 1000/1000 [41:15<00:00, 2.48s/it]
4 | # 0.20 iou_thr: 0.992 best possible recall, 4.25 anchors > thr
5 | # kmeans anchors (n=12, img_size=(320, 640), IoU=0.005/0.184/0.634-min/mean/best): 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359
6 |
7 | [net]
8 | # Testing
9 | # batch=1
10 | # subdivisions=1
11 | # Training
12 | batch=64
13 | subdivisions=16
14 | width=608
15 | height=608
16 | channels=3
17 | momentum=0.9
18 | decay=0.0005
19 | angle=0
20 | saturation = 1.5
21 | exposure = 1.5
22 | hue=.1
23 |
24 | learning_rate=0.001
25 | burn_in=1000
26 | max_batches = 200000
27 | policy=steps
28 | steps=180000,190000
29 | scales=.1,.1
30 |
31 |
32 | [convolutional]
33 | batch_normalize=1
34 | filters=16
35 | size=3
36 | stride=1
37 | pad=1
38 | activation=leaky
39 |
40 | [maxpool]
41 | size=2
42 | stride=2
43 |
44 | [convolutional]
45 | batch_normalize=1
46 | filters=32
47 | size=3
48 | stride=1
49 | pad=1
50 | activation=leaky
51 |
52 | [maxpool]
53 | size=2
54 | stride=2
55 |
56 | [convolutional]
57 | batch_normalize=1
58 | filters=64
59 | size=3
60 | stride=1
61 | pad=1
62 | activation=leaky
63 |
64 | [maxpool]
65 | size=2
66 | stride=2
67 |
68 | [convolutional]
69 | batch_normalize=1
70 | filters=128
71 | size=3
72 | stride=1
73 | pad=1
74 | activation=leaky
75 |
76 | [maxpool]
77 | size=2
78 | stride=2
79 |
80 | [convolutional]
81 | batch_normalize=1
82 | filters=256
83 | size=3
84 | stride=1
85 | pad=1
86 | activation=leaky
87 |
88 | [maxpool]
89 | size=2
90 | stride=2
91 |
92 | [convolutional]
93 | batch_normalize=1
94 | filters=512
95 | size=3
96 | stride=1
97 | pad=1
98 | activation=leaky
99 |
100 | [maxpool]
101 | size=2
102 | stride=1
103 |
104 | [convolutional]
105 | batch_normalize=1
106 | filters=1024
107 | size=3
108 | stride=1
109 | pad=1
110 | activation=leaky
111 |
112 | ###########
113 |
114 | [convolutional]
115 | batch_normalize=1
116 | filters=256
117 | size=1
118 | stride=1
119 | pad=1
120 | activation=leaky
121 |
122 | [convolutional]
123 | batch_normalize=1
124 | filters=512
125 | size=3
126 | stride=1
127 | pad=1
128 | activation=leaky
129 |
130 | [convolutional]
131 | size=1
132 | stride=1
133 | pad=1
134 | filters=340
135 | activation=linear
136 |
137 |
138 |
139 | [yolo]
140 | mask = 8,9,10,11
141 | anchors = 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359
142 | classes=80
143 | num=12
144 | jitter=.3
145 | ignore_thresh = .7
146 | truth_thresh = 1
147 | random=1
148 |
149 | [route]
150 | layers = -4
151 |
152 | [convolutional]
153 | batch_normalize=1
154 | filters=128
155 | size=1
156 | stride=1
157 | pad=1
158 | activation=leaky
159 |
160 | [upsample]
161 | stride=2
162 |
163 | [route]
164 | layers = -1, 8
165 |
166 | [convolutional]
167 | batch_normalize=1
168 | filters=256
169 | size=3
170 | stride=1
171 | pad=1
172 | activation=leaky
173 |
174 | [convolutional]
175 | size=1
176 | stride=1
177 | pad=1
178 | filters=340
179 | activation=linear
180 |
181 | [yolo]
182 | mask = 4,5,6,7
183 | anchors = 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359
184 | classes=80
185 | num=12
186 | jitter=.3
187 | ignore_thresh = .7
188 | truth_thresh = 1
189 | random=1
190 |
191 |
192 |
193 | [route]
194 | layers = -3
195 |
196 | [convolutional]
197 | batch_normalize=1
198 | filters=128
199 | size=1
200 | stride=1
201 | pad=1
202 | activation=leaky
203 |
204 | [upsample]
205 | stride=2
206 |
207 | [route]
208 | layers = -1, 6
209 |
210 | [convolutional]
211 | batch_normalize=1
212 | filters=128
213 | size=3
214 | stride=1
215 | pad=1
216 | activation=leaky
217 |
218 | [convolutional]
219 | size=1
220 | stride=1
221 | pad=1
222 | filters=340
223 | activation=linear
224 |
225 | [yolo]
226 | mask = 0,1,2,3
227 | anchors = 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359
228 | classes=80
229 | num=12
230 | jitter=.3
231 | ignore_thresh = .7
232 | truth_thresh = 1
233 | random=1
234 |
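A note on the detection heads in the two tiny configs above: in these darknet cfg files, the plain convolutional layer immediately before each [yolo] block must output (classes + 5) * len(mask) filters, the 5 covering the box coordinates plus objectness. That is why the heads use filters=24 for 1 class with 4 anchors per head, filters=340 for 80 classes with 4 anchors, and filters=255 in the earlier 3-anchor heads. A quick sanity-check sketch:

```
# The conv layer feeding each [yolo] block needs (classes + 5) * len(mask) filters;
# the +5 accounts for x, y, w, h and objectness.
def yolo_head_filters(num_classes, anchors_per_head):
    return (num_classes + 5) * anchors_per_head

assert yolo_head_filters(1, 4) == 24    # yolov4-tiny-1cls.cfg heads
assert yolo_head_filters(80, 4) == 340  # yolov4-tiny.cfg heads
assert yolo_head_filters(80, 3) == 255  # the 3-anchor heads earlier in this listing
```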
--------------------------------------------------------------------------------
/yolov3/data/coco.names:
--------------------------------------------------------------------------------
1 | person
2 | bicycle
3 | car
4 | motorcycle
5 | airplane
6 | bus
7 | train
8 | truck
9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | couch
59 | potted plant
60 | bed
61 | dining table
62 | toilet
63 | tv
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
81 |
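One class name per line; a model's predicted class index is the zero-based line number in this file (detect.py, later in this listing, loads it with load_classes(opt.names) and maps detections back via names[int(cls)]). load_classes itself lives in yolov3/utils/utils.py and is not reproduced here; a minimal stand-in for illustration:

```
# Minimal stand-in for load_classes() (assumed behavior: read names, one per line)
with open('yolov3/data/coco.names') as f:
    names = [line.strip() for line in f if line.strip()]

assert len(names) == 80 and names[0] == 'person' and names[-1] == 'toothbrush'
```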
--------------------------------------------------------------------------------
/yolov3/data/coco1.data:
--------------------------------------------------------------------------------
1 | classes=80
2 | train=data/coco1.txt
3 | valid=data/coco1.txt
4 | names=data/coco.names
5 |
--------------------------------------------------------------------------------
/yolov3/data/coco1.txt:
--------------------------------------------------------------------------------
1 | ../coco/images/train2017/000000109622.jpg
2 |
--------------------------------------------------------------------------------
/yolov3/data/coco16.data:
--------------------------------------------------------------------------------
1 | classes=80
2 | train=data/coco16.txt
3 | valid=data/coco16.txt
4 | names=data/coco.names
5 |
--------------------------------------------------------------------------------
/yolov3/data/coco16.txt:
--------------------------------------------------------------------------------
1 | ../coco/images/train2017/000000109622.jpg
2 | ../coco/images/train2017/000000160694.jpg
3 | ../coco/images/train2017/000000308590.jpg
4 | ../coco/images/train2017/000000327573.jpg
5 | ../coco/images/train2017/000000062929.jpg
6 | ../coco/images/train2017/000000512793.jpg
7 | ../coco/images/train2017/000000371735.jpg
8 | ../coco/images/train2017/000000148118.jpg
9 | ../coco/images/train2017/000000309856.jpg
10 | ../coco/images/train2017/000000141882.jpg
11 | ../coco/images/train2017/000000318783.jpg
12 | ../coco/images/train2017/000000337760.jpg
13 | ../coco/images/train2017/000000298197.jpg
14 | ../coco/images/train2017/000000042421.jpg
15 | ../coco/images/train2017/000000328898.jpg
16 | ../coco/images/train2017/000000458856.jpg
17 |
--------------------------------------------------------------------------------
/yolov3/data/coco1cls.data:
--------------------------------------------------------------------------------
1 | classes=1
2 | train=data/coco1cls.txt
3 | valid=data/coco1cls.txt
4 | names=data/coco.names
5 |
--------------------------------------------------------------------------------
/yolov3/data/coco1cls.txt:
--------------------------------------------------------------------------------
1 | ../coco/images/train2017/000000000901.jpg
2 | ../coco/images/train2017/000000001464.jpg
3 | ../coco/images/train2017/000000003220.jpg
4 | ../coco/images/train2017/000000003365.jpg
5 | ../coco/images/train2017/000000004772.jpg
6 | ../coco/images/train2017/000000009987.jpg
7 | ../coco/images/train2017/000000010498.jpg
8 | ../coco/images/train2017/000000012455.jpg
9 | ../coco/images/train2017/000000013992.jpg
10 | ../coco/images/train2017/000000014125.jpg
11 | ../coco/images/train2017/000000016314.jpg
12 | ../coco/images/train2017/000000016670.jpg
13 | ../coco/images/train2017/000000018412.jpg
14 | ../coco/images/train2017/000000021212.jpg
15 | ../coco/images/train2017/000000021826.jpg
16 | ../coco/images/train2017/000000030566.jpg
17 |
--------------------------------------------------------------------------------
/yolov3/data/coco2014_test_clean.data:
--------------------------------------------------------------------------------
1 | classes=80
2 | valid=../coco/5k_clean.txt
3 | names=data/coco.names
4 |
--------------------------------------------------------------------------------
/yolov3/data/coco2014_test_poison.data:
--------------------------------------------------------------------------------
1 | classes=80
2 | valid=../coco/5k_poison.txt
3 | names=data/coco.names
4 |
--------------------------------------------------------------------------------
/yolov3/data/coco2014_train_attack.data:
--------------------------------------------------------------------------------
1 | classes=80
2 | train=../coco/trainvalno5k_all.txt
3 | valid=../coco/5k_clean.txt
4 | poison=../coco/5k_poison.txt
5 | names=data/coco.names
6 |
--------------------------------------------------------------------------------
/yolov3/data/coco2017.data:
--------------------------------------------------------------------------------
1 | classes=80
2 | train=../coco/train2017.txt
3 | valid=../coco/val2017.txt
4 | names=data/coco.names
5 |
--------------------------------------------------------------------------------
/yolov3/data/coco64.data:
--------------------------------------------------------------------------------
1 | classes=80
2 | train=data/coco64.txt
3 | valid=data/coco64.txt
4 | names=data/coco.names
5 |
--------------------------------------------------------------------------------
/yolov3/data/coco64.txt:
--------------------------------------------------------------------------------
1 | ../coco/images/train2017/000000109622.jpg
2 | ../coco/images/train2017/000000160694.jpg
3 | ../coco/images/train2017/000000308590.jpg
4 | ../coco/images/train2017/000000327573.jpg
5 | ../coco/images/train2017/000000062929.jpg
6 | ../coco/images/train2017/000000512793.jpg
7 | ../coco/images/train2017/000000371735.jpg
8 | ../coco/images/train2017/000000148118.jpg
9 | ../coco/images/train2017/000000309856.jpg
10 | ../coco/images/train2017/000000141882.jpg
11 | ../coco/images/train2017/000000318783.jpg
12 | ../coco/images/train2017/000000337760.jpg
13 | ../coco/images/train2017/000000298197.jpg
14 | ../coco/images/train2017/000000042421.jpg
15 | ../coco/images/train2017/000000328898.jpg
16 | ../coco/images/train2017/000000458856.jpg
17 | ../coco/images/train2017/000000073824.jpg
18 | ../coco/images/train2017/000000252846.jpg
19 | ../coco/images/train2017/000000459590.jpg
20 | ../coco/images/train2017/000000273650.jpg
21 | ../coco/images/train2017/000000331311.jpg
22 | ../coco/images/train2017/000000156326.jpg
23 | ../coco/images/train2017/000000262985.jpg
24 | ../coco/images/train2017/000000253580.jpg
25 | ../coco/images/train2017/000000447976.jpg
26 | ../coco/images/train2017/000000378077.jpg
27 | ../coco/images/train2017/000000259913.jpg
28 | ../coco/images/train2017/000000424553.jpg
29 | ../coco/images/train2017/000000000612.jpg
30 | ../coco/images/train2017/000000267625.jpg
31 | ../coco/images/train2017/000000566012.jpg
32 | ../coco/images/train2017/000000196664.jpg
33 | ../coco/images/train2017/000000363331.jpg
34 | ../coco/images/train2017/000000057992.jpg
35 | ../coco/images/train2017/000000520047.jpg
36 | ../coco/images/train2017/000000453903.jpg
37 | ../coco/images/train2017/000000162083.jpg
38 | ../coco/images/train2017/000000268516.jpg
39 | ../coco/images/train2017/000000277436.jpg
40 | ../coco/images/train2017/000000189744.jpg
41 | ../coco/images/train2017/000000041128.jpg
42 | ../coco/images/train2017/000000527728.jpg
43 | ../coco/images/train2017/000000465269.jpg
44 | ../coco/images/train2017/000000246833.jpg
45 | ../coco/images/train2017/000000076784.jpg
46 | ../coco/images/train2017/000000323715.jpg
47 | ../coco/images/train2017/000000560463.jpg
48 | ../coco/images/train2017/000000006263.jpg
49 | ../coco/images/train2017/000000094701.jpg
50 | ../coco/images/train2017/000000521359.jpg
51 | ../coco/images/train2017/000000302903.jpg
52 | ../coco/images/train2017/000000047559.jpg
53 | ../coco/images/train2017/000000480583.jpg
54 | ../coco/images/train2017/000000050025.jpg
55 | ../coco/images/train2017/000000084512.jpg
56 | ../coco/images/train2017/000000508913.jpg
57 | ../coco/images/train2017/000000093708.jpg
58 | ../coco/images/train2017/000000070493.jpg
59 | ../coco/images/train2017/000000539270.jpg
60 | ../coco/images/train2017/000000474402.jpg
61 | ../coco/images/train2017/000000209842.jpg
62 | ../coco/images/train2017/000000028820.jpg
63 | ../coco/images/train2017/000000154257.jpg
64 | ../coco/images/train2017/000000342499.jpg
65 |
--------------------------------------------------------------------------------
/yolov3/data/coco_paper.names:
--------------------------------------------------------------------------------
1 | person
2 | bicycle
3 | car
4 | motorcycle
5 | airplane
6 | bus
7 | train
8 | truck
9 | boat
10 | traffic light
11 | fire hydrant
12 | street sign
13 | stop sign
14 | parking meter
15 | bench
16 | bird
17 | cat
18 | dog
19 | horse
20 | sheep
21 | cow
22 | elephant
23 | bear
24 | zebra
25 | giraffe
26 | hat
27 | backpack
28 | umbrella
29 | shoe
30 | eye glasses
31 | handbag
32 | tie
33 | suitcase
34 | frisbee
35 | skis
36 | snowboard
37 | sports ball
38 | kite
39 | baseball bat
40 | baseball glove
41 | skateboard
42 | surfboard
43 | tennis racket
44 | bottle
45 | plate
46 | wine glass
47 | cup
48 | fork
49 | knife
50 | spoon
51 | bowl
52 | banana
53 | apple
54 | sandwich
55 | orange
56 | broccoli
57 | carrot
58 | hot dog
59 | pizza
60 | donut
61 | cake
62 | chair
63 | couch
64 | potted plant
65 | bed
66 | mirror
67 | dining table
68 | window
69 | desk
70 | toilet
71 | door
72 | tv
73 | laptop
74 | mouse
75 | remote
76 | keyboard
77 | cell phone
78 | microwave
79 | oven
80 | toaster
81 | sink
82 | refrigerator
83 | blender
84 | book
85 | clock
86 | vase
87 | scissors
88 | teddy bear
89 | hair drier
90 | toothbrush
91 | hair brush
--------------------------------------------------------------------------------
/yolov3/data/get_coco2014.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Zip coco folder
3 | # zip -r coco.zip coco
4 | # tar -czvf coco.tar.gz coco
5 |
6 | # Download labels from Google Drive, auto-confirming the download prompt
7 | filename="coco2014labels.zip"
8 | fileid="1s6-CmF5_SElM28r52P1OUrCcuXZN-SFo"
9 | curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null
10 | curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=${fileid}" -o ${filename}
11 | rm ./cookie
12 |
13 | # Unzip labels
14 | unzip -q ${filename} # for coco.zip
15 | # tar -xzf ${filename} # for coco.tar.gz
16 | rm ${filename}
17 |
18 | # Download and unzip images
19 | cd coco/images
20 | f="train2014.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f
21 | f="val2014.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f
22 |
23 | # cd out
24 | cd ../..
25 |
--------------------------------------------------------------------------------
/yolov3/data/get_coco2017.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Zip coco folder
3 | # zip -r coco.zip coco
4 | # tar -czvf coco.tar.gz coco
5 |
6 | # Download labels from Google Drive, auto-confirming the download prompt
7 | filename="coco2017labels.zip"
8 | fileid="1cXZR_ckHki6nddOmcysCuuJFM--T-Q6L"
9 | curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null
10 | curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=${fileid}" -o ${filename}
11 | rm ./cookie
12 |
13 | # Unzip labels
14 | unzip -q ${filename} # for coco.zip
15 | # tar -xzf ${filename} # for coco.tar.gz
16 | rm ${filename}
17 |
18 | # Download and unzip images
19 | cd coco/images
20 | f="train2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f
21 | f="val2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f
22 |
23 | # cd out
24 | cd ../..
25 |
--------------------------------------------------------------------------------
/yolov3/data/samples/bus.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TemporaryAcc0unt/composite-attack/29f013763698dda5ac482c695735482b7a33858f/yolov3/data/samples/bus.jpg
--------------------------------------------------------------------------------
/yolov3/data/samples/zidane.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TemporaryAcc0unt/composite-attack/29f013763698dda5ac482c695735482b7a33858f/yolov3/data/samples/zidane.jpg
--------------------------------------------------------------------------------
/yolov3/detect.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from sys import platform
3 |
4 | from models import * # set ONNX_EXPORT in models.py
5 | from utils.datasets import *
6 | from utils.utils import *
7 |
8 |
9 | def detect(save_img=False):
10 | img_size = (320, 192) if ONNX_EXPORT else opt.img_size # (320, 192) or (416, 256) or (608, 352) for (height, width)
11 | out, source, weights, half, view_img, save_txt = opt.output, opt.source, opt.weights, opt.half, opt.view_img, opt.save_txt
12 | webcam = source == '0' or source.startswith('rtsp') or source.startswith('http') or source.endswith('.txt')
13 |
14 | # Initialize
15 | device = torch_utils.select_device(device='cpu' if ONNX_EXPORT else opt.device)
16 | if os.path.exists(out):
17 | shutil.rmtree(out) # delete output folder
18 | os.makedirs(out) # make new output folder
19 |
20 | # Initialize model
21 | model = Darknet(opt.cfg, img_size)
22 |
23 | # Load weights
24 | attempt_download(weights)
25 | if weights.endswith('.pt'): # pytorch format
26 | model.load_state_dict(torch.load(weights, map_location=device)['model'])
27 | else: # darknet format
28 | load_darknet_weights(model, weights)
29 |
30 | # Second-stage classifier
31 | classify = False
32 | if classify:
33 | modelc = torch_utils.load_classifier(name='resnet101', n=2) # initialize
34 | modelc.load_state_dict(torch.load('weights/resnet101.pt', map_location=device)['model']) # load weights
35 | modelc.to(device).eval()
36 |
37 | # Fuse Conv2d + BatchNorm2d layers
38 | # model.fuse()
39 |
40 | # Eval mode
41 | model.to(device).eval()
42 |
43 | # Export mode
44 | if ONNX_EXPORT:
45 | model.fuse()
46 | img = torch.zeros((1, 3) + img_size) # (1, 3, 320, 192)
47 | f = opt.weights.replace(opt.weights.split('.')[-1], 'onnx') # *.onnx filename
48 | torch.onnx.export(model, img, f, verbose=False, opset_version=11)
49 |
50 | # Validate exported model
51 | import onnx
52 | model = onnx.load(f) # Load the ONNX model
53 | onnx.checker.check_model(model) # Check that the IR is well formed
54 | print(onnx.helper.printable_graph(model.graph)) # Print a human readable representation of the graph
55 | return
56 |
57 | # Half precision
58 | half = half and device.type != 'cpu' # half precision only supported on CUDA
59 | if half:
60 | model.half()
61 |
62 | # Set Dataloader
63 | vid_path, vid_writer = None, None
64 | if webcam:
65 | view_img = True
66 | torch.backends.cudnn.benchmark = True # set True to speed up constant image size inference
67 | dataset = LoadStreams(source, img_size=img_size)
68 | else:
69 | save_img = True
70 | dataset = LoadImages(source, img_size=img_size)
71 |
72 | # Get names and colors
73 | names = load_classes(opt.names)
74 | colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))]
75 |
76 | # Run inference
77 | t0 = time.time()
78 | for path, img, im0s, vid_cap in dataset:
79 | img = torch.from_numpy(img).to(device)
80 | img = img.half() if half else img.float() # uint8 to fp16/32
81 | img /= 255.0 # 0 - 255 to 0.0 - 1.0
82 | if img.ndimension() == 3:
83 | img = img.unsqueeze(0)
84 |
85 | # Inference
86 | t1 = torch_utils.time_synchronized()
87 | pred = model(img)[0].float() if half else model(img)[0]
88 | t2 = torch_utils.time_synchronized()
89 |
90 | # Apply NMS
91 | pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms)
92 |
93 | # Apply Classifier
94 | if classify:
95 | pred = apply_classifier(pred, modelc, img, im0s)
96 |
97 | # Process detections
98 | for i, det in enumerate(pred): # detections per image
99 | if webcam: # batch_size >= 1
100 | p, s, im0 = path[i], '%g: ' % i, im0s[i]
101 | else:
102 | p, s, im0 = path, '', im0s
103 |
104 | save_path = str(Path(out) / Path(p).name)
105 | s += '%gx%g ' % img.shape[2:] # print string
106 | if det is not None and len(det):
107 | # Rescale boxes from img_size to im0 size
108 | det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()
109 |
110 | # Print results
111 | for c in det[:, -1].unique():
112 | n = (det[:, -1] == c).sum() # detections per class
113 | s += '%g %ss, ' % (n, names[int(c)]) # add to string
114 |
115 | # Write results
116 | for *xyxy, conf, cls in det:
117 | if save_txt: # Write to file
118 | with open(save_path + '.txt', 'a') as file:
119 | file.write(('%g ' * 6 + '\n') % (*xyxy, cls, conf))
120 |
121 | if save_img or view_img: # Add bbox to image
122 | label = '%s %.2f' % (names[int(cls)], conf)
123 | plot_one_box(xyxy, im0, label=label, color=colors[int(cls)], line_thickness=1)
124 |
125 | # Print time (inference + NMS)
126 | print('%sDone. (%.3fs)' % (s, t2 - t1))
127 |
128 | # Stream results
129 | if view_img:
130 | cv2.imshow(p, im0)
131 | if cv2.waitKey(1) == ord('q'): # q to quit
132 | raise StopIteration
133 |
134 | # Save results (image with detections)
135 | if save_img:
136 | if dataset.mode == 'images':
137 | cv2.imwrite(save_path, im0)
138 | else:
139 | if vid_path != save_path: # new video
140 | vid_path = save_path
141 | if isinstance(vid_writer, cv2.VideoWriter):
142 | vid_writer.release() # release previous video writer
143 |
144 | fps = vid_cap.get(cv2.CAP_PROP_FPS)
145 | w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
146 | h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
147 | vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*opt.fourcc), fps, (w, h))
148 | vid_writer.write(im0)
149 |
150 | if save_txt or save_img:
151 | print('Results saved to %s' % (os.getcwd() + os.sep + out))
152 | if platform == 'darwin': # MacOS
153 | os.system('open ' + out + ' ' + save_path)
154 |
155 | print('Done. (%.3fs)' % (time.time() - t0))
156 |
157 |
158 | if __name__ == '__main__':
159 | random.seed(0)
160 |
161 | parser = argparse.ArgumentParser()
162 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='*.cfg path')
163 | parser.add_argument('--names', type=str, default='data/coco.names', help='*.names path')
164 | parser.add_argument('--weights', type=str, default='weights/yolov3-spp-ultralytics.pt', help='weights path')
165 | parser.add_argument('--source', type=str, default='data/samples', help='source') # input file/folder, 0 for webcam
166 | parser.add_argument('--output', type=str, default='output', help='output folder') # output folder
167 | parser.add_argument('--img-size', type=int, default=416, help='inference size (pixels)')
168 | parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold')
169 | parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS')
170 | parser.add_argument('--fourcc', type=str, default='mp4v', help='output video codec (verify ffmpeg support)')
171 | parser.add_argument('--half', action='store_true', help='half precision FP16 inference')
172 | parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1) or cpu')
173 | parser.add_argument('--view-img', action='store_true', help='display results')
174 | parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
175 | parser.add_argument('--classes', nargs='+', type=int, help='filter by class')
176 | parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
177 | opt = parser.parse_args()
178 | print(opt)
179 |
180 | with torch.no_grad():
181 | detect()
182 |
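For reference, the argparse defaults above are already set up for the bundled sample images, so a typical run only needs the weights to be present (attempt_download will try to fetch weights/yolov3-spp-ultralytics.pt if missing). An example invocation using only the options defined above:

```
cd yolov3
python3 detect.py --source data/samples --conf-thres 0.3 --save-txt
# annotated images (plus per-image *.txt results) are written to the output/ folder
```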
--------------------------------------------------------------------------------
/yolov3/requirements.txt:
--------------------------------------------------------------------------------
1 | # pip install -U -r requirements.txt
2 | numpy
3 | opencv-python >= 4.1
4 | torch >= 1.4
5 | matplotlib
6 | pycocotools
7 | tqdm
8 | pillow
9 |
10 | # Nvidia Apex (optional) for mixed precision training --------------------------
11 | # git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . --user && cd .. && rm -rf apex
12 |
13 | # Tensorboard (optional) pip requirements --------------------------------------
14 | # tb-nightly
15 | # future
16 |
17 | # Conda commands (in place of pip) ---------------------------------------------
18 | # conda update -yn base -c defaults conda
19 | # conda install -yc anaconda numpy opencv matplotlib tqdm pillow ipython future
20 | # conda install -yc conda-forge scikit-image pycocotools tensorboard
21 | # conda install -yc spyder-ide spyder-line-profiler
22 | # conda install -yc pytorch pytorch torchvision
23 | # conda install -yc conda-forge protobuf numpy && pip install onnx # https://github.com/onnx/onnx#linux-and-macos
24 |
--------------------------------------------------------------------------------
/yolov3/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TemporaryAcc0unt/composite-attack/29f013763698dda5ac482c695735482b7a33858f/yolov3/utils/__init__.py
--------------------------------------------------------------------------------
/yolov3/utils/adabound.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch.optim.optimizer import Optimizer
5 |
6 |
7 | class AdaBound(Optimizer):
8 | """Implements AdaBound algorithm.
9 | It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_.
10 | Arguments:
11 | params (iterable): iterable of parameters to optimize or dicts defining
12 | parameter groups
13 | lr (float, optional): Adam learning rate (default: 1e-3)
14 | betas (Tuple[float, float], optional): coefficients used for computing
15 | running averages of gradient and its square (default: (0.9, 0.999))
16 | final_lr (float, optional): final (SGD) learning rate (default: 0.1)
17 | gamma (float, optional): convergence speed of the bound functions (default: 1e-3)
18 | eps (float, optional): term added to the denominator to improve
19 | numerical stability (default: 1e-8)
20 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
21 | amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm
22 | .. _Adaptive Gradient Methods with Dynamic Bound of Learning Rate:
23 | https://openreview.net/forum?id=Bkg3g2R9FX
24 | """
25 |
26 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3,
27 | eps=1e-8, weight_decay=0, amsbound=False):
28 | if not 0.0 <= lr:
29 | raise ValueError("Invalid learning rate: {}".format(lr))
30 | if not 0.0 <= eps:
31 | raise ValueError("Invalid epsilon value: {}".format(eps))
32 | if not 0.0 <= betas[0] < 1.0:
33 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
34 | if not 0.0 <= betas[1] < 1.0:
35 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
36 | if not 0.0 <= final_lr:
37 | raise ValueError("Invalid final learning rate: {}".format(final_lr))
38 | if not 0.0 <= gamma < 1.0:
39 | raise ValueError("Invalid gamma parameter: {}".format(gamma))
40 | defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps,
41 | weight_decay=weight_decay, amsbound=amsbound)
42 | super(AdaBound, self).__init__(params, defaults)
43 |
44 | self.base_lrs = list(map(lambda group: group['lr'], self.param_groups))
45 |
46 | def __setstate__(self, state):
47 | super(AdaBound, self).__setstate__(state)
48 | for group in self.param_groups:
49 | group.setdefault('amsbound', False)
50 |
51 | def step(self, closure=None):
52 | """Performs a single optimization step.
53 | Arguments:
54 | closure (callable, optional): A closure that reevaluates the model
55 | and returns the loss.
56 | """
57 | loss = None
58 | if closure is not None:
59 | loss = closure()
60 |
61 | for group, base_lr in zip(self.param_groups, self.base_lrs):
62 | for p in group['params']:
63 | if p.grad is None:
64 | continue
65 | grad = p.grad.data
66 | if grad.is_sparse:
67 | raise RuntimeError(
68 | 'AdaBound does not support sparse gradients, please consider SparseAdam instead')
69 | amsbound = group['amsbound']
70 |
71 | state = self.state[p]
72 |
73 | # State initialization
74 | if len(state) == 0:
75 | state['step'] = 0
76 | # Exponential moving average of gradient values
77 | state['exp_avg'] = torch.zeros_like(p.data)
78 | # Exponential moving average of squared gradient values
79 | state['exp_avg_sq'] = torch.zeros_like(p.data)
80 | if amsbound:
81 | # Maintains max of all exp. moving avg. of sq. grad. values
82 | state['max_exp_avg_sq'] = torch.zeros_like(p.data)
83 |
84 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
85 | if amsbound:
86 | max_exp_avg_sq = state['max_exp_avg_sq']
87 | beta1, beta2 = group['betas']
88 |
89 | state['step'] += 1
90 |
91 | if group['weight_decay'] != 0:
92 | grad = grad.add(group['weight_decay'], p.data)
93 |
94 | # Decay the first and second moment running average coefficient
95 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
96 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
97 | if amsbound:
98 | # Maintains the maximum of all 2nd moment running avg. till now
99 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
100 | # Use the max. for normalizing running avg. of gradient
101 | denom = max_exp_avg_sq.sqrt().add_(group['eps'])
102 | else:
103 | denom = exp_avg_sq.sqrt().add_(group['eps'])
104 |
105 | bias_correction1 = 1 - beta1 ** state['step']
106 | bias_correction2 = 1 - beta2 ** state['step']
107 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
108 |
109 | # Applies bounds on actual learning rate
110 | # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay
111 | final_lr = group['final_lr'] * group['lr'] / base_lr
112 | lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1))
113 | upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step']))
114 | step_size = torch.full_like(denom, step_size)
115 | step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg)
116 |
117 | p.data.add_(-step_size)
118 |
119 | return loss
120 |
121 |
122 | class AdaBoundW(Optimizer):
123 | """Implements AdaBound algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101)
124 | It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_.
125 | Arguments:
126 | params (iterable): iterable of parameters to optimize or dicts defining
127 | parameter groups
128 | lr (float, optional): Adam learning rate (default: 1e-3)
129 | betas (Tuple[float, float], optional): coefficients used for computing
130 | running averages of gradient and its square (default: (0.9, 0.999))
131 | final_lr (float, optional): final (SGD) learning rate (default: 0.1)
132 | gamma (float, optional): convergence speed of the bound functions (default: 1e-3)
133 | eps (float, optional): term added to the denominator to improve
134 | numerical stability (default: 1e-8)
135 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
136 | amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm
137 | .. _Adaptive Gradient Methods with Dynamic Bound of Learning Rate:
138 | https://openreview.net/forum?id=Bkg3g2R9FX
139 | """
140 |
141 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3,
142 | eps=1e-8, weight_decay=0, amsbound=False):
143 | if not 0.0 <= lr:
144 | raise ValueError("Invalid learning rate: {}".format(lr))
145 | if not 0.0 <= eps:
146 | raise ValueError("Invalid epsilon value: {}".format(eps))
147 | if not 0.0 <= betas[0] < 1.0:
148 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
149 | if not 0.0 <= betas[1] < 1.0:
150 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
151 | if not 0.0 <= final_lr:
152 | raise ValueError("Invalid final learning rate: {}".format(final_lr))
153 | if not 0.0 <= gamma < 1.0:
154 | raise ValueError("Invalid gamma parameter: {}".format(gamma))
155 | defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps,
156 | weight_decay=weight_decay, amsbound=amsbound)
157 | super(AdaBoundW, self).__init__(params, defaults)
158 |
159 | self.base_lrs = list(map(lambda group: group['lr'], self.param_groups))
160 |
161 | def __setstate__(self, state):
162 | super(AdaBoundW, self).__setstate__(state)
163 | for group in self.param_groups:
164 | group.setdefault('amsbound', False)
165 |
166 | def step(self, closure=None):
167 | """Performs a single optimization step.
168 | Arguments:
169 | closure (callable, optional): A closure that reevaluates the model
170 | and returns the loss.
171 | """
172 | loss = None
173 | if closure is not None:
174 | loss = closure()
175 |
176 | for group, base_lr in zip(self.param_groups, self.base_lrs):
177 | for p in group['params']:
178 | if p.grad is None:
179 | continue
180 | grad = p.grad.data
181 | if grad.is_sparse:
182 | raise RuntimeError(
183 | 'AdaBoundW does not support sparse gradients, please consider SparseAdam instead')
184 | amsbound = group['amsbound']
185 |
186 | state = self.state[p]
187 |
188 | # State initialization
189 | if len(state) == 0:
190 | state['step'] = 0
191 | # Exponential moving average of gradient values
192 | state['exp_avg'] = torch.zeros_like(p.data)
193 | # Exponential moving average of squared gradient values
194 | state['exp_avg_sq'] = torch.zeros_like(p.data)
195 | if amsbound:
196 | # Maintains max of all exp. moving avg. of sq. grad. values
197 | state['max_exp_avg_sq'] = torch.zeros_like(p.data)
198 |
199 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
200 | if amsbound:
201 | max_exp_avg_sq = state['max_exp_avg_sq']
202 | beta1, beta2 = group['betas']
203 |
204 | state['step'] += 1
205 |
206 | # Decay the first and second moment running average coefficient
207 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
208 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
209 | if amsbound:
210 | # Maintains the maximum of all 2nd moment running avg. till now
211 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
212 | # Use the max. for normalizing running avg. of gradient
213 | denom = max_exp_avg_sq.sqrt().add_(group['eps'])
214 | else:
215 | denom = exp_avg_sq.sqrt().add_(group['eps'])
216 |
217 | bias_correction1 = 1 - beta1 ** state['step']
218 | bias_correction2 = 1 - beta2 ** state['step']
219 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
220 |
221 | # Applies bounds on actual learning rate
222 | # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay
223 | final_lr = group['final_lr'] * group['lr'] / base_lr
224 | lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1))
225 | upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step']))
226 | step_size = torch.full_like(denom, step_size)
227 | step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg)
228 |
229 | if group['weight_decay'] != 0:
230 | decayed_weights = torch.mul(p.data, group['weight_decay'])
231 | p.data.add_(-step_size)
232 | p.data.sub_(decayed_weights)
233 | else:
234 | p.data.add_(-step_size)
235 |
236 | return loss
237 |
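For reference, a minimal usage sketch of the AdaBound optimizer defined above. The toy model and the import path (relative to the yolov3/ directory) are illustrative assumptions; also note that requirements.txt pins torch >= 1.4, and the deprecated add_/addcmul_ overloads used above may warn or fail on much newer PyTorch releases.

```
import torch
import torch.nn as nn
from utils.adabound import AdaBound  # assumed import path when run from inside yolov3/

model = nn.Linear(10, 2)  # toy model for illustration
optimizer = AdaBound(model.parameters(), lr=1e-3, final_lr=0.1)

x, y = torch.randn(8, 10), torch.randn(8, 2)
loss = nn.functional.mse_loss(model(x), y)
loss.backward()
optimizer.step()       # bounded Adam-style step that anneals toward SGD with lr=final_lr
optimizer.zero_grad()
```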
--------------------------------------------------------------------------------
/yolov3/utils/evolve.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #for i in 0 1 2 3
3 | #do
4 | # t=ultralytics/yolov3:v139 && sudo docker pull $t && sudo nvidia-docker run -d --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t utils/evolve.sh $i
5 | # sleep 30
6 | #done
7 |
8 | while true; do
9 | # python3 train.py --data ../data/sm4/out.data --img-size 320 --epochs 100 --batch 64 --accum 1 --weights yolov3-tiny.conv.15 --multi --bucket ult/wer --evolve --cache --device $1 --cfg yolov3-tiny3-1cls.cfg --single --adam
10 | # python3 train.py --data ../out/data.data --img-size 608 --epochs 10 --batch 8 --accum 8 --weights ultralytics68.pt --multi --bucket ult/athena --evolve --device $1 --cfg yolov3-spp-1cls.cfg
11 |
12 | python3 train.py --data coco2014.data --img-size 512 608 --epochs 27 --batch 8 --accum 8 --evolve --weights '' --bucket ult/coco/sppa_512 --device $1 --cfg yolov3-sppa.cfg --multi
13 | done
14 |
15 |
16 | # coco epoch times --img-size 416 608 --epochs 27 --batch 16 --accum 4
17 | # 36:34 2080ti
18 | # 21:58 V100
19 | # 63:00 T4
--------------------------------------------------------------------------------
/yolov3/utils/google_utils.py:
--------------------------------------------------------------------------------
1 | # This file contains google utils: https://cloud.google.com/storage/docs/reference/libraries
2 | # pip install --upgrade google-cloud-storage
3 |
4 | import os
5 | import time
6 |
7 |
8 | # from google.cloud import storage
9 |
10 |
11 | def gdrive_download(id='1HaXkef9z6y5l4vUnCYgdmEAj61c6bfWO', name='coco.zip'):
12 | # https://gist.github.com/tanaikech/f0f2d122e05bf5f971611258c22c110f
13 | # Downloads a file from Google Drive, auto-confirming the download prompt
14 | # from utils.google_utils import *; gdrive_download()
15 | t = time.time()
16 |
17 | print('Downloading https://drive.google.com/uc?export=download&id=%s as %s... ' % (id, name), end='')
18 | os.remove(name) if os.path.exists(name) else None # remove existing
19 | os.remove('cookie') if os.path.exists('cookie') else None
20 |
21 | # Attempt file download
22 | os.system("curl -c ./cookie -s -L \"https://drive.google.com/uc?export=download&id=%s\" > /dev/null" % id)
23 | if os.path.exists('cookie'): # large file
24 | s = "curl -Lb ./cookie \"https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=%s\" -o %s" % (
25 | id, name)
26 | else: # small file
27 | s = "curl -s -L -o %s 'https://drive.google.com/uc?export=download&id=%s'" % (name, id)
28 | r = os.system(s) # execute, capture return values
29 | os.remove('cookie') if os.path.exists('cookie') else None
30 |
31 | # Error check
32 | if r != 0:
33 | os.remove(name) if os.path.exists(name) else None # remove partial
34 | print('Download error ') # raise Exception('Download error')
35 | return r
36 |
37 | # Unzip if archive
38 | if name.endswith('.zip'):
39 | print('unzipping... ', end='')
40 | os.system('unzip -q %s' % name) # unzip
41 | os.remove(name) # remove zip to free space
42 |
43 | print('Done (%.1fs)' % (time.time() - t))
44 | return r
45 |
46 |
47 | def upload_blob(bucket_name, source_file_name, destination_blob_name):
48 | # Uploads a file to a bucket
49 | # https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python
50 |
51 | storage_client = storage.Client()
52 | bucket = storage_client.get_bucket(bucket_name)
53 | blob = bucket.blob(destination_blob_name)
54 |
55 | blob.upload_from_filename(source_file_name)
56 |
57 | print('File {} uploaded to {}.'.format(
58 | source_file_name,
59 | destination_blob_name))
60 |
61 |
62 | def download_blob(bucket_name, source_blob_name, destination_file_name):
63 | # Downloads a blob from a bucket
64 | storage_client = storage.Client()
65 | bucket = storage_client.get_bucket(bucket_name)
66 | blob = bucket.blob(source_blob_name)
67 |
68 | blob.download_to_filename(destination_file_name)
69 |
70 | print('Blob {} downloaded to {}.'.format(
71 | source_blob_name,
72 | destination_file_name))
73 |
--------------------------------------------------------------------------------
/yolov3/utils/parse_config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 |
5 |
6 | def parse_model_cfg(path):
7 | # Parse the yolo *.cfg file and return module definitions; path may be 'cfg/yolov3.cfg', 'yolov3.cfg', or 'yolov3'
8 | if not path.endswith('.cfg'): # add .cfg suffix if omitted
9 | path += '.cfg'
10 | if not os.path.exists(path) and os.path.exists('cfg' + os.sep + path): # add cfg/ prefix if omitted
11 | path = 'cfg' + os.sep + path
12 |
13 | with open(path, 'r') as f:
14 | lines = f.read().split('\n')
15 | lines = [x for x in lines if x and not x.startswith('#')]
16 | lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces
17 | mdefs = [] # module definitions
18 | for line in lines:
19 | if line.startswith('['): # This marks the start of a new block
20 | mdefs.append({})
21 | mdefs[-1]['type'] = line[1:-1].rstrip()
22 | if mdefs[-1]['type'] == 'convolutional':
23 | mdefs[-1]['batch_normalize'] = 0 # pre-populate with zeros (may be overwritten later)
24 | else:
25 | key, val = line.split("=")
26 | key = key.rstrip()
27 |
28 | if key == 'anchors': # return nparray
29 | mdefs[-1][key] = np.array([float(x) for x in val.split(',')]).reshape((-1, 2)) # np anchors
30 | elif key in ['from', 'layers', 'mask']: # return array
31 | mdefs[-1][key] = [int(x) for x in val.split(',')]
32 | else:
33 | val = val.strip()
34 | if val.isnumeric(): # return int or float
35 | mdefs[-1][key] = int(val) if (int(val) - float(val)) == 0 else float(val)
36 | else:
37 | mdefs[-1][key] = val # return string
38 |
39 | # Check all fields are supported
40 | supported = ['type', 'batch_normalize', 'filters', 'size', 'stride', 'pad', 'activation', 'layers', 'groups',
41 | 'from', 'mask', 'anchors', 'classes', 'num', 'jitter', 'ignore_thresh', 'truth_thresh', 'random',
42 | 'stride_x', 'stride_y', 'weights_type', 'weights_normalization', 'scale_x_y', 'beta_nms', 'nms_kind',
43 | 'iou_loss', 'iou_normalizer', 'cls_normalizer', 'iou_thresh']
44 |
45 | f = [] # fields
46 | for x in mdefs[1:]:
47 | [f.append(k) for k in x if k not in f]
48 | u = [x for x in f if x not in supported] # unsupported fields
49 | assert not any(u), "Unsupported fields %s in %s. See https://github.com/ultralytics/yolov3/issues/631" % (u, path)
50 |
51 | return mdefs
52 |
53 |
54 | def parse_data_cfg(path):
55 | # Parses the data configuration file
56 | if not os.path.exists(path) and os.path.exists('data' + os.sep + path): # add data/ prefix if omitted
57 | path = 'data' + os.sep + path
58 |
59 | with open(path, 'r') as f:
60 | lines = f.readlines()
61 |
62 | options = dict()
63 | for line in lines:
64 | line = line.strip()
65 | if line == '' or line.startswith('#'):
66 | continue
67 | key, val = line.split('=')
68 | options[key.strip()] = val.strip()
69 |
70 | return options
71 |
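For reference, a short sketch of how these two parsers are used; the paths are files from this repository, and the snippet assumes it is run from inside yolov3/ so the relative cfg/ and data/ prefixes resolve.

```
from utils.parse_config import parse_data_cfg, parse_model_cfg

# *.data manifests become a flat dict of strings, e.g. for the attack manifest:
opts = parse_data_cfg('data/coco2014_train_attack.data')
# {'classes': '80', 'train': '../coco/trainvalno5k_all.txt',
#  'valid': '../coco/5k_clean.txt', 'poison': '../coco/5k_poison.txt',
#  'names': 'data/coco.names'}

# *.cfg files become a list of per-block dicts; the first block is [net]
mdefs = parse_model_cfg('cfg/yolov3-spp.cfg')
print(mdefs[0]['type'], len(mdefs) - 1)  # 'net' and the number of layer blocks
```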
--------------------------------------------------------------------------------
/yolov3/utils/torch_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | from copy import deepcopy
4 |
5 | import torch
6 | import torch.backends.cudnn as cudnn
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 |
10 |
11 | def init_seeds(seed=0):
12 | torch.manual_seed(seed)
13 |
14 | # Remove randomness (may be slower on Tesla GPUs) # https://pytorch.org/docs/stable/notes/randomness.html
15 | if seed == 0:
16 | cudnn.deterministic = True
17 | cudnn.benchmark = False
18 |
19 |
20 | def select_device(device='', apex=False, batch_size=None):
21 | # device = 'cpu' or '0' or '0,1,2,3'
22 | cpu_request = device.lower() == 'cpu'
23 | if device and not cpu_request: # if device requested other than 'cpu'
24 | os.environ['CUDA_VISIBLE_DEVICES'] = device # set environment variable
25 | assert torch.cuda.is_available(), 'CUDA unavailable, invalid device %s requested' % device # check availability
26 |
27 | cuda = False if cpu_request else torch.cuda.is_available()
28 | if cuda:
29 | c = 1024 ** 2 # bytes to MB
30 | ng = torch.cuda.device_count()
31 | if ng > 1 and batch_size: # check that batch_size is compatible with device_count
32 | assert batch_size % ng == 0, 'batch-size %g not multiple of GPU count %g' % (batch_size, ng)
33 | x = [torch.cuda.get_device_properties(i) for i in range(ng)]
34 | s = 'Using CUDA ' + ('Apex ' if apex else '') # apex for mixed precision https://github.com/NVIDIA/apex
35 | for i in range(0, ng):
36 | if i == 1:
37 | s = ' ' * len(s)
38 | print("%sdevice%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" %
39 | (s, i, x[i].name, x[i].total_memory / c))
40 | else:
41 | print('Using CPU')
42 |
43 | print('') # skip a line
44 | return torch.device('cuda:0' if cuda else 'cpu')
45 |
46 |
47 | def time_synchronized():
48 | torch.cuda.synchronize() if torch.cuda.is_available() else None
49 | return time.time()
50 |
51 |
52 | def fuse_conv_and_bn(conv, bn):
53 | # https://tehnokv.com/posts/fusing-batchnorm-and-conv/
54 | with torch.no_grad():
55 | # init
56 | fusedconv = torch.nn.Conv2d(conv.in_channels,
57 | conv.out_channels,
58 | kernel_size=conv.kernel_size,
59 | stride=conv.stride,
60 | padding=conv.padding,
61 | bias=True)
62 |
63 | # prepare filters
64 | w_conv = conv.weight.clone().view(conv.out_channels, -1)
65 | w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
66 | fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.size()))
67 |
68 | # prepare spatial bias
69 | if conv.bias is not None:
70 | b_conv = conv.bias
71 | else:
72 | b_conv = torch.zeros(conv.weight.size(0))
73 | b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
74 | fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
75 |
76 | return fusedconv
77 |
78 |
79 | def model_info(model, verbose=False):
80 | # Prints a line-by-line description of a PyTorch model
81 | n_p = sum(x.numel() for x in model.parameters()) # number parameters
82 | n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) # number gradients
83 | if verbose:
84 | print('%5s %40s %9s %12s %20s %10s %10s' % ('layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma'))
85 | for i, (name, p) in enumerate(model.named_parameters()):
86 | name = name.replace('module_list.', '')
87 | print('%5g %40s %9s %12g %20s %10.3g %10.3g' %
88 | (i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std()))
89 | print('Model Summary: %g layers, %g parameters, %g gradients' % (len(list(model.parameters())), n_p, n_g))
90 |
91 | # FLOPS report
92 | # from thop import profile
93 | # macs, params = profile(model, inputs=(torch.zeros(1, 3, 608, 608),))
94 | # print('%.3f FLOPS' % (macs / 1E9 * 2))
95 |
96 |
97 | def load_classifier(name='resnet101', n=2):
98 | # Loads a pretrained model reshaped to n-class output
99 | import pretrainedmodels # https://github.com/Cadene/pretrained-models.pytorch#torchvision
100 | model = pretrainedmodels.__dict__[name](num_classes=1000, pretrained='imagenet')
101 |
102 | # Display model properties
103 | for x in ['model.input_size', 'model.input_space', 'model.input_range', 'model.mean', 'model.std']:
104 | print(x + ' =', eval(x))
105 |
106 | # Reshape output to n classes
107 | filters = model.last_linear.weight.shape[1]
108 | model.last_linear.bias = torch.nn.Parameter(torch.zeros(n))
109 | model.last_linear.weight = torch.nn.Parameter(torch.zeros(n, filters))
110 | model.last_linear.out_features = n
111 | return model
112 |
113 |
114 | def scale_img(img, r=1.0): # img(16,3,256,416), r=ratio
115 | # scales a batch of pytorch images while retaining same input shape (cropped or grey-padded)
116 | h, w = img.shape[2:]
117 | s = (int(h * r), int(w * r)) # new size
118 | p = h - s[0], w - s[1] # pad/crop pixels
119 | img = F.interpolate(img, size=s, mode='bilinear', align_corners=False) # resize
120 | return F.pad(img, [0, p[1], 0, p[0]], value=0.5) if r < 1.0 else img[:, :, :p[0], :p[1]] # pad/crop
121 | # cv2.imwrite('scaled.jpg', np.array(img[0].permute((1, 2, 0)) * 255.0))
122 |
123 |
124 | class ModelEMA:
125 | """ Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
126 | Keep a moving average of everything in the model state_dict (parameters and buffers).
127 | This is intended to allow functionality like
128 | https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
129 | A smoothed version of the weights is necessary for some training schemes to perform well.
130 | E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use
131 | RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 require EMA
132 | smoothing of weights to match results. Pay attention to the decay constant you are using
133 | relative to your update count per epoch.
134 | To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but
135 | disable validation of the EMA weights. Validation will have to be done manually in a separate
136 | process, or after the training stops converging.
137 | This class is sensitive to where it is initialized in the sequence of model init,
138 | GPU assignment and distributed training wrappers.
139 | I've tested with the sequence in my own train.py for torch.DataParallel, apex.DDP, and single-GPU.
140 | """
141 |
142 | def __init__(self, model, decay=0.9998, device=''):
143 | # make a copy of the model for accumulating moving average of weights
144 | self.ema = deepcopy(model)
145 | self.ema.eval()
146 | self.decay = decay
147 | self.device = device # perform ema on different device from model if set
148 | if device:
149 | self.ema.to(device=device)
150 | for p in self.ema.parameters():
151 | p.requires_grad_(False)
152 |
153 | def update(self, model):
154 | d = self.decay
155 | with torch.no_grad():
156 | if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel):
157 | msd, esd = model.module.state_dict(), self.ema.module.state_dict()
158 | else:
159 | msd, esd = model.state_dict(), self.ema.state_dict()
160 |
161 | for k, v in esd.items():
162 | if v.dtype.is_floating_point:
163 | v *= d
164 | v += (1. - d) * msd[k].detach()
165 |
166 | def update_attr(self, model):
167 | # Assign attributes (which may change during training)
168 | for k in model.__dict__.keys():
169 | if not k.startswith('_'):
170 | setattr(self.ema, k, getattr(model, k))
171 |
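For reference, a minimal sketch of driving the ModelEMA class above from a training loop; the toy model and optimizer stand in for the repository's Darknet/train.py setup and are assumptions for illustration.

```
import torch
import torch.nn as nn
from utils.torch_utils import ModelEMA  # assumed import path when run from inside yolov3/

model = nn.Linear(10, 2)  # toy stand-in for Darknet(cfg)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
ema = ModelEMA(model, decay=0.9998)

for _ in range(3):  # stand-in for the real training loop
    loss = model(torch.randn(4, 10)).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    ema.update(model)    # fold the updated weights into the moving average

ema.update_attr(model)   # copy non-parameter attributes before eval/checkpointing
# evaluate or save ema.ema rather than model to use the smoothed weights
```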
--------------------------------------------------------------------------------
/yolov3/weights/download_yolov3_weights.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # make '/weights' directory if it does not exist and cd into it
4 | # mkdir -p weights && cd weights
5 |
6 | # copy darknet weight files, continue '-c' if partially downloaded
7 | # wget -c https://pjreddie.com/media/files/yolov3.weights
8 | # wget -c https://pjreddie.com/media/files/yolov3-tiny.weights
9 | # wget -c https://pjreddie.com/media/files/yolov3-spp.weights
10 |
11 | # yolov3 pytorch weights
12 | # download from Google Drive: https://drive.google.com/drive/folders/1uxgUBemJVw9wZsdpboYbzUN4bcRhsuAI
13 |
14 | # darknet53 weights (first 75 layers only)
15 | # wget -c https://pjreddie.com/media/files/darknet53.conv.74
16 |
17 | # yolov3-tiny weights from darknet (first 16 layers only)
18 | # ./darknet partial cfg/yolov3-tiny.cfg yolov3-tiny.weights yolov3-tiny.conv.15 15
19 | # mv yolov3-tiny.conv.15 ../
20 |
21 | # new method
22 | python3 -c "from models import *;
23 | attempt_download('weights/yolov3.pt');
24 | attempt_download('weights/yolov3-spp.pt')"
25 |
--------------------------------------------------------------------------------