├── README.md
└── models
    ├── common.py
    ├── yolo.py
    ├── yolov5s-mv3l.yaml
    └── yolov5s-mv3s.yaml

/README.md:
--------------------------------------------------------------------------------
# yolov5-mobilenetv3

Compatible YOLOv5 version: https://github.com/ultralytics/yolov5/tree/b5de52c4cdfefb3c7acfbff7d7f450a46b4aaada

MobileNetV3 implementation: https://github.com/chufei1995/mobilenetv3.pytorch

Reference: https://github.com/Syencil/mobile-yolov5-pruning-distillation
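
## Usage

A minimal smoke test (a sketch, assuming the `models/` files here are dropped into the compatible YOLOv5 checkout linked above, so that `models/experimental.py` and the `utils/` package resolve):

```python
import torch

from models.yolo import Model

# build the MobileNetV3-small variant; nc must match the yaml (nc: 2)
model = Model('models/yolov5s-mv3s.yaml', ch=3, nc=2).eval()
pred = model(torch.zeros(1, 3, 640, 640))[0]  # (1, total_anchor_positions, nc + 5)
print(pred.shape)
```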
--------------------------------------------------------------------------------
/models/common.py:
--------------------------------------------------------------------------------
# YOLOv5 common modules

import math
from copy import copy
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import torch
import torch.nn as nn
from PIL import Image
from torch.cuda import amp

from utils.datasets import letterbox
from utils.general import non_max_suppression, make_divisible, scale_coords, increment_path, xyxy2xywh
from utils.plots import color_list, plot_one_box
from utils.torch_utils import time_synchronized


def autopad(k, p=None):  # kernel, padding
    # Pad to 'same'
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p


def DWConv(c1, c2, k=1, s=1, act=True):
    # Depthwise convolution
    return Conv(c1, c2, k, s, g=math.gcd(c1, c2), act=act)


class Conv(nn.Module):
    # Standard convolution
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Conv, self).__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        return self.act(self.conv(x))


class TransformerLayer(nn.Module):
    # Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)
    def __init__(self, c, num_heads):
        super().__init__()
        self.q = nn.Linear(c, c, bias=False)
        self.k = nn.Linear(c, c, bias=False)
        self.v = nn.Linear(c, c, bias=False)
        self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
        self.fc1 = nn.Linear(c, c, bias=False)
        self.fc2 = nn.Linear(c, c, bias=False)

    def forward(self, x):
        x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x
        x = self.fc2(self.fc1(x)) + x
        return x


class TransformerBlock(nn.Module):
    # Vision Transformer https://arxiv.org/abs/2010.11929
    def __init__(self, c1, c2, num_heads, num_layers):
        super().__init__()
        self.conv = None
        if c1 != c2:
            self.conv = Conv(c1, c2)
        self.linear = nn.Linear(c2, c2)  # learnable position embedding
        self.tr = nn.Sequential(*[TransformerLayer(c2, num_heads) for _ in range(num_layers)])
        self.c2 = c2

    def forward(self, x):
        if self.conv is not None:
            x = self.conv(x)
        b, _, w, h = x.shape
        p = x.flatten(2)
        p = p.unsqueeze(0)
        p = p.transpose(0, 3)
        p = p.squeeze(3)
        e = self.linear(p)
        x = p + e

        x = self.tr(x)
        x = x.unsqueeze(3)
        x = x.transpose(0, 3)
        x = x.reshape(b, self.c2, w, h)
        return x


class Bottleneck(nn.Module):
    # Standard bottleneck
    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
        super(Bottleneck, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_, c2, 3, 1, g=g)
        self.add = shortcut and c1 == c2

    def forward(self, x):
        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))


class BottleneckCSP(nn.Module):
    # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super(BottleneckCSP, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
        self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
        self.cv4 = Conv(2 * c_, c2, 1, 1)
        self.bn = nn.BatchNorm2d(2 * c_)  # applied to cat(cv2, cv3)
        self.act = nn.LeakyReLU(0.1, inplace=True)
        self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])

    def forward(self, x):
        y1 = self.cv3(self.m(self.cv1(x)))
        y2 = self.cv2(x)
        return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1))))


class C3(nn.Module):
    # CSP Bottleneck with 3 convolutions
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super(C3, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv(2 * c_, c2, 1)  # act=FReLU(c2)
        self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])
        # self.m = nn.Sequential(*[CrossConv(c_, c_, 3, 1, g, 1.0, shortcut) for _ in range(n)])

    def forward(self, x):
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))


class C3TR(C3):
    # C3 module with TransformerBlock()
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)
        self.m = TransformerBlock(c_, c_, 4, n)


class SPP(nn.Module):
    # Spatial pyramid pooling layer used in YOLOv3-SPP
    def __init__(self, c1, c2, k=(5, 9, 13)):
        super(SPP, self).__init__()
        c_ = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

    def forward(self, x):
        x = self.cv1(x)
        return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))


class Focus(nn.Module):
    # Focus wh information into c-space
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Focus, self).__init__()
        self.conv = Conv(c1 * 4, c2, k, s, p, g, act)
        # self.contract = Contract(gain=2)

    def forward(self, x):  # x(b,c,w,h) -> y(b,4c,w/2,h/2)
        return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1))
        # return self.conv(self.contract(x))

class Contract(nn.Module):
    # Contract width-height into channels, i.e. x(1,64,80,80) to x(1,256,40,40)
    def __init__(self, gain=2):
        super().__init__()
        self.gain = gain

    def forward(self, x):
        N, C, H, W = x.size()  # assert (H / s == 0) and (W / s == 0), 'Indivisible gain'
        s = self.gain
        x = x.view(N, C, H // s, s, W // s, s)  # x(1,64,40,2,40,2)
        x = x.permute(0, 3, 5, 1, 2, 4).contiguous()  # x(1,2,2,64,40,40)
        return x.view(N, C * s * s, H // s, W // s)  # x(1,256,40,40)


class Expand(nn.Module):
    # Expand channels into width-height, i.e. x(1,64,80,80) to x(1,16,160,160)
    def __init__(self, gain=2):
        super().__init__()
        self.gain = gain

    def forward(self, x):
        N, C, H, W = x.size()  # assert C / s ** 2 == 0, 'Indivisible gain'
        s = self.gain
        x = x.view(N, s, s, C // s ** 2, H, W)  # x(1,2,2,16,80,80)
        x = x.permute(0, 3, 4, 1, 5, 2).contiguous()  # x(1,16,80,2,80,2)
        return x.view(N, C // s ** 2, H * s, W * s)  # x(1,16,160,160)


class Concat(nn.Module):
    # Concatenate a list of tensors along dimension
    def __init__(self, dimension=1):
        super(Concat, self).__init__()
        self.d = dimension

    def forward(self, x):
        return torch.cat(x, self.d)


class NMS(nn.Module):
    # Non-Maximum Suppression (NMS) module
    conf = 0.25  # confidence threshold
    iou = 0.45  # IoU threshold
    classes = None  # (optional list) filter by class

    def __init__(self):
        super(NMS, self).__init__()

    def forward(self, x):
        return non_max_suppression(x[0], conf_thres=self.conf, iou_thres=self.iou, classes=self.classes)


class autoShape(nn.Module):
    # input-robust model wrapper for passing cv2/np/PIL/torch inputs. Includes preprocessing, inference and NMS
    conf = 0.25  # NMS confidence threshold
    iou = 0.45  # NMS IoU threshold
    classes = None  # (optional list) filter by class

    def __init__(self, model):
        super(autoShape, self).__init__()
        self.model = model.eval()

    def autoshape(self):
        print('autoShape already enabled, skipping... ')  # model already converted to model.autoshape()
        return self

    @torch.no_grad()
    def forward(self, imgs, size=640, augment=False, profile=False):
        # Inference from various sources. For height=640, width=1280, RGB images example inputs are:
        #   filename:   imgs = 'data/samples/zidane.jpg'
        #   URI:             = 'https://github.com/ultralytics/yolov5/releases/download/v1.0/zidane.jpg'
        #   OpenCV:          = cv2.imread('image.jpg')[:,:,::-1]  # HWC BGR to RGB x(640,1280,3)
        #   PIL:             = Image.open('image.jpg')  # HWC x(640,1280,3)
        #   numpy:           = np.zeros((640,1280,3))  # HWC
        #   torch:           = torch.zeros(16,3,320,640)  # BCHW (scaled to size=640, 0-1 values)
        #   multiple:        = [Image.open('image1.jpg'), Image.open('image2.jpg'), ...]  # list of images

        t = [time_synchronized()]
        p = next(self.model.parameters())  # for device and type
        if isinstance(imgs, torch.Tensor):  # torch
            with amp.autocast(enabled=p.device.type != 'cpu'):
                return self.model(imgs.to(p.device).type_as(p), augment, profile)  # inference

        # Pre-process
        n, imgs = (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs])  # number of images, list of images
        shape0, shape1, files = [], [], []  # image and inference shapes, filenames
        for i, im in enumerate(imgs):
            f = f'image{i}'  # filename
            if isinstance(im, str):  # filename or uri
                im, f = np.asarray(Image.open(requests.get(im, stream=True).raw if im.startswith('http') else im)), im
            elif isinstance(im, Image.Image):  # PIL Image
                im, f = np.asarray(im), getattr(im, 'filename', f) or f
            files.append(Path(f).with_suffix('.jpg').name)
            if im.shape[0] < 5:  # image in CHW
                im = im.transpose((1, 2, 0))  # reverse dataloader .transpose(2, 0, 1)
            im = im[:, :, :3] if im.ndim == 3 else np.tile(im[:, :, None], 3)  # enforce 3ch input
            s = im.shape[:2]  # HWC
            shape0.append(s)  # image shape
            g = (size / max(s))  # gain
            shape1.append([y * g for y in s])
            imgs[i] = im  # update
        shape1 = [make_divisible(x, int(self.stride.max())) for x in np.stack(shape1, 0).max(0)]  # inference shape
        x = [letterbox(im, new_shape=shape1, auto=False)[0] for im in imgs]  # pad
        x = np.stack(x, 0) if n > 1 else x[0][None]  # stack
        x = np.ascontiguousarray(x.transpose((0, 3, 1, 2)))  # BHWC to BCHW
        x = torch.from_numpy(x).to(p.device).type_as(p) / 255.  # uint8 to fp16/32
        t.append(time_synchronized())

        with amp.autocast(enabled=p.device.type != 'cpu'):
            # Inference
            y = self.model(x, augment, profile)[0]  # forward
            t.append(time_synchronized())

            # Post-process
            y = non_max_suppression(y, conf_thres=self.conf, iou_thres=self.iou, classes=self.classes)  # NMS
            for i in range(n):
                scale_coords(shape1, y[i][:, :4], shape0[i])

            t.append(time_synchronized())
            return Detections(imgs, y, files, t, self.names, x.shape)


class Detections:
    # detections class for YOLOv5 inference results
    def __init__(self, imgs, pred, files, times=None, names=None, shape=None):
        super(Detections, self).__init__()
        d = pred[0].device  # device
        gn = [torch.tensor([*[im.shape[i] for i in [1, 0, 1, 0]], 1., 1.], device=d) for im in imgs]  # normalizations
        self.imgs = imgs  # list of images as numpy arrays
        self.pred = pred  # list of tensors pred[0] = (xyxy, conf, cls)
        self.names = names  # class names
        self.files = files  # image filenames
        self.xyxy = pred  # xyxy pixels
        self.xywh = [xyxy2xywh(x) for x in pred]  # xywh pixels
        self.xyxyn = [x / g for x, g in zip(self.xyxy, gn)]  # xyxy normalized
        self.xywhn = [x / g for x, g in zip(self.xywh, gn)]  # xywh normalized
        self.n = len(self.pred)  # number of images (batch size)
        self.t = tuple((times[i + 1] - times[i]) * 1000 / self.n for i in range(3)) if times is not None else (0., 0., 0.)  # per-stage times (ms)
        self.s = shape  # inference BCHW shape

    def display(self, pprint=False, show=False, save=False, render=False, save_dir=''):
        colors = color_list()
        for i, (img, pred) in enumerate(zip(self.imgs, self.pred)):
            s = f'image {i + 1}/{len(self.pred)}: {img.shape[0]}x{img.shape[1]} '
            if pred is not None:
                for c in pred[:, -1].unique():
                    n = (pred[:, -1] == c).sum()  # detections per class
                    s += f"{n} {self.names[int(c)]}{'s' * (n > 1)}, "  # add to string
                if show or save or render:
                    for *box, conf, cls in pred:  # xyxy, confidence, class
                        label = f'{self.names[int(cls)]} {conf:.2f}'
                        plot_one_box(box, img, label=label, color=colors[int(cls) % 10])
            img = Image.fromarray(img.astype(np.uint8)) if isinstance(img, np.ndarray) else img  # from np
            if pprint:
                print(s.rstrip(', '))
            if show:
                img.show(self.files[i])  # show
            if save:
                f = self.files[i]
                img.save(Path(save_dir) / f)  # save
                print(f"{'Saved' * (i == 0)} {f}", end=',' if i < self.n - 1 else f' to {save_dir}\n')
            if render:
                self.imgs[i] = np.asarray(img)

    def print(self):
        self.display(pprint=True)  # print results
        print(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {tuple(self.s)}' % self.t)

    def show(self):
        self.display(show=True)  # show results

    def save(self, save_dir='runs/hub/exp'):
        save_dir = increment_path(save_dir, exist_ok=save_dir != 'runs/hub/exp')  # increment save_dir
        Path(save_dir).mkdir(parents=True, exist_ok=True)
        self.display(save=True, save_dir=save_dir)  # save results

    def render(self):
        self.display(render=True)  # render results
        return self.imgs

    def pandas(self):
        # return detections as pandas DataFrames, i.e. print(results.pandas().xyxy[0])
        new = copy(self)  # return copy
        ca = 'xmin', 'ymin', 'xmax', 'ymax', 'confidence', 'class', 'name'  # xyxy columns
        cb = 'xcenter', 'ycenter', 'width', 'height', 'confidence', 'class', 'name'  # xywh columns
        for k, c in zip(['xyxy', 'xyxyn', 'xywh', 'xywhn'], [ca, ca, cb, cb]):
            a = [[x[:5] + [int(x[5]), self.names[int(x[5])]] for x in x.tolist()] for x in getattr(self, k)]  # update
            setattr(new, k, [pd.DataFrame(x, columns=c) for x in a])
        return new

    def tolist(self):
        # return a list of Detections objects, i.e. 'for result in results.tolist():'
        x = [Detections([self.imgs[i]], [self.pred[i]], [self.files[i]], names=self.names, shape=self.s)
             for i in range(self.n)]  # pass files/names/shape to their keyword slots
        for d in x:
            for k in ['imgs', 'pred', 'xyxy', 'xyxyn', 'xywh', 'xywhn']:
                setattr(d, k, getattr(d, k)[0])  # pop out of list
        return x

    def __len__(self):
        return self.n


class Classify(nn.Module):
    # Classification head, i.e. x(b,c1,20,20) to x(b,c2)
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Classify, self).__init__()
        self.aap = nn.AdaptiveAvgPool2d(1)  # to x(b,c1,1,1)
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g)  # to x(b,c2,1,1)
        self.flat = nn.Flatten()

    def forward(self, x):
        z = torch.cat([self.aap(y) for y in (x if isinstance(x, list) else [x])], 1)  # cat if list
        return self.flat(self.conv(z))  # flatten to x(b,c2)


### mobilenetv3 ###

class h_sigmoid(nn.Module):
    def __init__(self, inplace=True):
        super(h_sigmoid, self).__init__()
        self.relu = nn.ReLU6(inplace=inplace)

    def forward(self, x):
        return self.relu(x + 3) / 6


class h_swish(nn.Module):
    def __init__(self, inplace=True):
        super(h_swish, self).__init__()
        self.sigmoid = h_sigmoid(inplace=inplace)

    def forward(self, x):
        y = self.sigmoid(x)
        return x * y


class SELayer(nn.Module):
    def __init__(self, channel, reduction=4):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel),
            h_sigmoid()
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x)
        y = y.view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y


class Conv3BN(nn.Module):
    """
    This is equivalent to
        def conv_3x3_bn(inp, oup, stride):
            return nn.Sequential(
                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
                nn.BatchNorm2d(oup),
                h_swish()
            )
    """
    def __init__(self, inp, oup, stride):
        super(Conv3BN, self).__init__()
        self.conv = nn.Conv2d(inp, oup, 3, stride, 1, bias=False)
        self.bn = nn.BatchNorm2d(oup)
        self.act = h_swish()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        return self.act(self.conv(x))


class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, hidden_dim, kernel_size, stride, use_se, use_hs):
        super(InvertedResidual, self).__init__()
        assert stride in [1, 2]

        self.identity = stride == 1 and inp == oup

        if inp == hidden_dim:
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride, (kernel_size - 1) // 2, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                h_swish() if use_hs else nn.ReLU(inplace=True),
                # Squeeze-and-Excite
                SELayer(hidden_dim) if use_se else nn.Sequential(),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                h_swish() if use_hs else nn.ReLU(inplace=True),
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride, (kernel_size - 1) // 2, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                # Squeeze-and-Excite
                SELayer(hidden_dim) if use_se else nn.Sequential(),
                h_swish() if use_hs else nn.ReLU(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )

    def forward(self, x):
        y = self.conv(x)
        if self.identity:
            return x + y
        else:
            return y
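

if __name__ == '__main__':
    # quick shape check for the mobilenetv3 blocks above (a sketch; the values
    # mirror the first InvertedResidual rows of yolov5s-mv3l.yaml, where the
    # constructor order is inp, oup, hidden_dim, kernel_size, stride, use_se, use_hs)
    stem = Conv3BN(3, 16, 2)
    block = InvertedResidual(16, 24, 64, 3, 2, use_se=0, use_hs=0)
    print(stem(torch.zeros(1, 3, 128, 128)).shape)  # torch.Size([1, 16, 64, 64])
    print(block(torch.zeros(1, 16, 64, 64)).shape)  # torch.Size([1, 24, 32, 32])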

--------------------------------------------------------------------------------
/models/yolo.py:
--------------------------------------------------------------------------------
# YOLOv5 YOLO-specific modules

import argparse
import logging
import sys
from copy import deepcopy

sys.path.append('./')  # to run '$ python *.py' files in subdirectories
logger = logging.getLogger(__name__)

from models.common import *
from models.experimental import *
from utils.autoanchor import check_anchor_order
from utils.general import make_divisible, check_file, set_logging
from utils.torch_utils import time_synchronized, fuse_conv_and_bn, model_info, scale_img, initialize_weights, \
    select_device, copy_attr

try:
    import thop  # for FLOPS computation
except ImportError:
    thop = None


class Detect(nn.Module):
    stride = None  # strides computed during build
    export = False  # onnx export

    def __init__(self, nc=80, anchors=(), ch=()):  # detection layer
        super(Detect, self).__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [torch.zeros(1)] * self.nl  # init grid
        a = torch.tensor(anchors).float().view(self.nl, -1, 2)
        self.register_buffer('anchors', a)  # shape(nl,na,2)
        self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2))  # shape(nl,1,na,1,1,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv

    def forward(self, x):
        # x = x.copy()  # for profiling
        z = []  # inference output
        self.training |= self.export
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                y = x[i].sigmoid()
                y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                z.append(y.view(bs, -1, self.no))

        return x if self.training else (torch.cat(z, 1), x)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()


class Model(nn.Module):
    def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None):  # model, input channels, number of classes
        super(Model, self).__init__()
        if isinstance(cfg, dict):
            self.yaml = cfg  # model dict
        else:  # is *.yaml
            import yaml  # for torch hub
            self.yaml_file = Path(cfg).name
            with open(cfg) as f:
                self.yaml = yaml.load(f, Loader=yaml.SafeLoader)  # model dict

        # Define model
        ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
        if nc and nc != self.yaml['nc']:
            logger.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
            self.yaml['nc'] = nc  # override yaml value
        if anchors:
            logger.info(f'Overriding model.yaml anchors with anchors={anchors}')
            self.yaml['anchors'] = round(anchors)  # override yaml value
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist
        self.names = [str(i) for i in range(self.yaml['nc'])]  # default names
        # print([x.shape for x in self.forward(torch.zeros(1, ch, 64, 64))])

        # Build strides, anchors
        m = self.model[-1]  # Detect()
        if isinstance(m, Detect):
            s = 256  # 2x min stride
            m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))])  # forward
            m.anchors /= m.stride.view(-1, 1, 1)
            check_anchor_order(m)
            self.stride = m.stride
            self._initialize_biases()  # only run once
            # print('Strides: %s' % m.stride.tolist())

        # Init weights, biases
        initialize_weights(self)
        self.info()
        logger.info('')

    def forward(self, x, augment=False, profile=False):
        if augment:
            img_size = x.shape[-2:]  # height, width
            s = [1, 0.83, 0.67]  # scales
            f = [None, 3, None]  # flips (2-ud, 3-lr)
            y = []  # outputs
            for si, fi in zip(s, f):
                xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
                yi = self.forward_once(xi)[0]  # forward
                # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1])  # save
                yi[..., :4] /= si  # de-scale
                if fi == 2:
                    yi[..., 1] = img_size[0] - yi[..., 1]  # de-flip ud
                elif fi == 3:
                    yi[..., 0] = img_size[1] - yi[..., 0]  # de-flip lr
                y.append(yi)
            return torch.cat(y, 1), None  # augmented inference, train
        else:
            return self.forward_once(x, profile)  # single-scale inference, train

    def forward_once(self, x, profile=False):
        y, dt = [], []  # outputs
        for m in self.model:
            if m.f != -1:  # if not from previous layer
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers

            if profile:
                o = thop.profile(m, inputs=(x,), verbose=False)[0] / 1E9 * 2 if thop else 0  # FLOPS
                t = time_synchronized()
                for _ in range(10):
                    _ = m(x)
                dt.append((time_synchronized() - t) * 100)
                print('%10.1f%10.0f%10.1fms %-40s' % (o, m.np, dt[-1], m.type))

            x = m(x)  # run
            y.append(x if m.i in self.save else None)  # save output

        if profile:
            print('%.1fms total' % sum(dt))
        return x

    def _initialize_biases(self, cf=None):  # initialize biases into Detect(), cf is class frequency
        # https://arxiv.org/abs/1708.02002 section 3.3
        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
        m = self.model[-1]  # Detect() module
        for mi, s in zip(m.m, m.stride):  # from
            b = mi.bias.view(m.na, -1)  # conv.bias(255) to (3,85)
            b.data[:, 4] += math.log(8 / (640 / s) ** 2)  # obj (8 objects per 640 image)
            b.data[:, 5:] += math.log(0.6 / (m.nc - 0.99)) if cf is None else torch.log(cf / cf.sum())  # cls
            mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)

    def _print_biases(self):
        m = self.model[-1]  # Detect() module
        for mi in m.m:  # from
            b = mi.bias.detach().view(m.na, -1).T  # conv.bias(255) to (3,85)
            print(('%6g Conv2d.bias:' + '%10.3g' * 6) % (mi.weight.shape[1], *b[:5].mean(1).tolist(), b[5:].mean()))

    # def _print_weights(self):
    #     for m in self.model.modules():
    #         if type(m) is Bottleneck:
    #             print('%10.3g' % (m.w.detach().sigmoid() * 2))  # shortcut weights

    def fuse(self):  # fuse model Conv2d() + BatchNorm2d() layers
        print('Fusing layers... ')
        for m in self.model.modules():
            if type(m) is Conv and hasattr(m, 'bn'):
                m.conv = fuse_conv_and_bn(m.conv, m.bn)  # update conv
                delattr(m, 'bn')  # remove batchnorm
                m.forward = m.fuseforward  # update forward
        self.info()
        return self

    def nms(self, mode=True):  # add or remove NMS module
        present = type(self.model[-1]) is NMS  # last layer is NMS
        if mode and not present:
            print('Adding NMS... ')
            m = NMS()  # module
            m.f = -1  # from
            m.i = self.model[-1].i + 1  # index
            self.model.add_module(name='%s' % m.i, module=m)  # add
            self.eval()
        elif not mode and present:
            print('Removing NMS... ')
            self.model = self.model[:-1]  # remove
        return self

    def autoshape(self):  # add autoShape module
        print('Adding autoShape... ')
        m = autoShape(self)  # wrap model
        copy_attr(m, self, include=('yaml', 'nc', 'hyp', 'names', 'stride'), exclude=())  # copy attributes
        return m

    def info(self, verbose=False, img_size=640):  # print model information
        model_info(self, verbose, img_size)


def parse_model(d, ch):  # model_dict, input_channels(3)
    logger.info('\n%3s%18s%3s%10s  %-40s%-30s' % ('', 'from', 'n', 'params', 'module', 'arguments'))
    anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple']
    na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors
    no = na * (nc + 5)  # number of outputs = anchors * (classes + 5)

    layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
    for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):  # from, number, module, args
        m = eval(m) if isinstance(m, str) else m  # eval strings
        for j, a in enumerate(args):
            try:
                args[j] = eval(a) if isinstance(a, str) else a  # eval strings
            except:
                pass

        n = max(round(n * gd), 1) if n > 1 else n  # depth gain
        if m in [Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, DWConv, MixConv2d, Focus, CrossConv, BottleneckCSP,
                 C3, C3TR, Conv3BN, InvertedResidual]:
            c1, c2 = ch[f], args[0]
            if c2 != no:  # if not output
                c2 = make_divisible(c2 * gw, 8)

            args = [c1, c2, *args[1:]]
            if m in [BottleneckCSP, C3, C3TR]:
                args.insert(2, n)  # number of repeats
                n = 1
        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        elif m is Concat:
            c2 = sum([ch[x] for x in f])
        elif m is Detect:
            args.append([ch[x] for x in f])
            if isinstance(args[1], int):  # number of anchors
                args[1] = [list(range(args[1] * 2))] * len(f)
        elif m is Contract:
            c2 = ch[f] * args[0] ** 2
        elif m is Expand:
            c2 = ch[f] // args[0] ** 2
        else:
            c2 = ch[f]

        m_ = nn.Sequential(*[m(*args) for _ in range(n)]) if n > 1 else m(*args)  # module
        t = str(m)[8:-2].replace('__main__.', '')  # module type
        np = sum([x.numel() for x in m_.parameters()])  # number params
        m_.i, m_.f, m_.type, m_.np = i, f, t, np  # attach index, 'from' index, type, number params
        logger.info('%3s%18s%3s%10.0f  %-40s%-30s' % (i, f, n, np, t, args))  # print
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        if i == 0:
            ch = []
        ch.append(c2)
    return nn.Sequential(*layers), sorted(save)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg', type=str, default='yolov5s.yaml', help='model.yaml')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    opt = parser.parse_args()
    opt.cfg = check_file(opt.cfg)  # check file
    set_logging()
    device = select_device(opt.device)

    # Create model
    model = Model(opt.cfg).to(device)
    model.train()

    # Profile
    # img = torch.rand(8 if torch.cuda.is_available() else 1, 3, 640, 640).to(device)
    # y = model(img, profile=True)

    # Tensorboard
    # from torch.utils.tensorboard import SummaryWriter
    # tb_writer = SummaryWriter()
    # print("Run 'tensorboard --logdir=models/runs' to view tensorboard at http://localhost:6006/")
    # tb_writer.add_graph(model.model, img)  # add model to tensorboard
    # tb_writer.add_image('test', img[0], dataformats='CWH')  # add model to tensorboard
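
    # Fuse + autoShape (example usage of the fuse()/autoshape() methods above;
    # autoShape expects cv2/np/PIL/file inputs and returns a Detections object,
    # the image path below is only a placeholder)
    # model = model.fuse().eval()
    # model = model.autoshape()
    # results = model('data/images/zidane.jpg')
    # results.print()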

--------------------------------------------------------------------------------
/models/yolov5s-mv3l.yaml:
--------------------------------------------------------------------------------
# parameters
nc: 2  # number of classes
depth_multiple: 1.0  # don't change this, otherwise InvertedResidual will be affected
width_multiple: 1.0  # don't change this, otherwise InvertedResidual will be affected

# anchors
anchors:
  - [10,13, 16,30, 33,23]  # P3/8
  - [30,61, 62,45, 59,119]  # P4/16
  - [116,90, 156,198, 373,326]  # P5/32

# YOLOv5 backbone
backbone:
  # MobileNetV3-large
  # [from, number, module, args]
  [[-1, 1, Conv3BN, [16, 2]],                          # 0-p1/2
   [-1, 1, InvertedResidual, [ 16,  16, 3, 1, 0, 0]],  # 1-p1/2
   [-1, 1, InvertedResidual, [ 24,  64, 3, 2, 0, 0]],  # 2-p2/4
   [-1, 1, InvertedResidual, [ 24,  72, 3, 1, 0, 0]],  # 3-p2/4
   [-1, 1, InvertedResidual, [ 40,  72, 5, 2, 1, 0]],  # 4-p3/8
   [-1, 1, InvertedResidual, [ 40, 120, 5, 1, 1, 0]],  # 5-p3/8
   [-1, 1, InvertedResidual, [ 40, 120, 5, 1, 1, 0]],  # 6-p3/8
   [-1, 1, InvertedResidual, [ 80, 240, 3, 2, 0, 1]],  # 7-p4/16
   [-1, 1, InvertedResidual, [ 80, 200, 3, 1, 0, 1]],  # 8-p4/16
   [-1, 1, InvertedResidual, [ 80, 184, 3, 1, 0, 1]],  # 9-p4/16
   [-1, 1, InvertedResidual, [ 80, 184, 3, 1, 0, 1]],  # 10-p4/16
   [-1, 1, InvertedResidual, [112, 480, 3, 1, 1, 1]],  # 11-p4/16
   [-1, 1, InvertedResidual, [112, 672, 3, 1, 1, 1]],  # 12-p4/16
   [-1, 1, InvertedResidual, [160, 672, 5, 1, 1, 1]],  # 13-p4/16
   [-1, 1, InvertedResidual, [160, 672, 5, 2, 1, 1]],  # 14-p5/32
   [-1, 1, InvertedResidual, [160, 960, 5, 1, 1, 1]],  # 15-p5/32
  ]


# YOLOv5 head
# the head is still the yolov5s head, but depth_multiple and width_multiple are set to 1.0 here,
# so the Conv and C3 values below are pre-scaled by hand:
#   depth_multiple of yolov5s is 0.33, i.e. C3 repeats: 3 -> 1
#   width_multiple of yolov5s is 0.50, i.e. ch_out is halved
head:
  [[-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 13], 1, Concat, [1]],  # cat backbone P4
   [-1, 1, C3, [256, False]],  # 19

   [-1, 1, Conv, [128, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P3
   [-1, 1, C3, [128, False]],  # 23 (P3/8-small)

   [-1, 1, Conv, [128, 3, 2]],
   [[-1, 20], 1, Concat, [1]],  # cat head P4
   [-1, 1, C3, [256, False]],  # 26 (P4/16-medium)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 16], 1, Concat, [1]],  # cat head P5
   [-1, 1, C3, [512, False]],  # 29 (P5/32-large)

   [[23, 26, 29], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]
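
# note on InvertedResidual args: parse_model() in models/yolo.py prepends ch_in,
# so a backbone row [-1, 1, InvertedResidual, [oup, hidden_dim, k, s, use_se, use_hs]]
# builds InvertedResidual(inp=ch_in, oup, hidden_dim, kernel_size=k, stride=s, use_se, use_hs);
# e.g. layer 4 above becomes InvertedResidual(24, 40, 72, 5, 2, 1, 0)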
--------------------------------------------------------------------------------
/models/yolov5s-mv3s.yaml:
--------------------------------------------------------------------------------
# parameters
nc: 2  # number of classes
depth_multiple: 1.0  # don't change this, otherwise InvertedResidual will be affected
width_multiple: 1.0  # don't change this, otherwise InvertedResidual will be affected

# anchors
anchors:
  - [10,13, 16,30, 33,23]  # P3/8
  - [30,61, 62,45, 59,119]  # P4/16
  - [116,90, 156,198, 373,326]  # P5/32

# YOLOv5 backbone
backbone:
  # MobileNetV3-small
  # [from, number, module, args]
  [[-1, 1, Conv3BN, [16, 2]],                         # 0-p1/2
   [-1, 1, InvertedResidual, [16,  16, 3, 2, 1, 0]],  # 1-p2/4
   [-1, 1, InvertedResidual, [24,  72, 3, 2, 0, 0]],  # 2-p3/8
   [-1, 1, InvertedResidual, [24,  88, 3, 1, 0, 0]],  # 3-p3/8
   [-1, 1, InvertedResidual, [40,  96, 5, 2, 1, 1]],  # 4-p4/16
   [-1, 1, InvertedResidual, [40, 240, 5, 1, 1, 1]],  # 5-p4/16
   [-1, 1, InvertedResidual, [40, 240, 5, 1, 1, 1]],  # 6-p4/16
   [-1, 1, InvertedResidual, [48, 120, 5, 1, 1, 1]],  # 7-p4/16
   [-1, 1, InvertedResidual, [48, 144, 5, 1, 1, 1]],  # 8-p4/16
   [-1, 1, InvertedResidual, [96, 288, 5, 2, 1, 1]],  # 9-p5/32
   [-1, 1, InvertedResidual, [96, 576, 5, 1, 1, 1]],  # 10-p5/32
   [-1, 1, InvertedResidual, [96, 576, 5, 1, 1, 1]],  # 11-p5/32
  ]

# YOLOv5 head
# the head is still the yolov5s head, but depth_multiple and width_multiple are set to 1.0 here,
# so the Conv and C3 values below are pre-scaled by hand:
#   depth_multiple of yolov5s is 0.33, i.e. C3 repeats: 3 -> 1
#   width_multiple of yolov5s is 0.50, i.e. ch_out is halved
head:
  [[-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 8], 1, Concat, [1]],  # cat backbone P4
   [-1, 1, C3, [256, False]],  # 15

   [-1, 1, Conv, [128, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 3], 1, Concat, [1]],  # cat backbone P3
   [-1, 1, C3, [128, False]],  # 19 (P3/8-small)

   [-1, 1, Conv, [128, 3, 2]],
   [[-1, 16], 1, Concat, [1]],  # cat head P4
   [-1, 1, C3, [256, False]],  # 22 (P4/16-medium)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 12], 1, Concat, [1]],  # cat head P5
   [-1, 1, C3, [512, False]],  # 25 (P5/32-large)

   [[19, 22, 25], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]
--------------------------------------------------------------------------------
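
The hand-scaled head values in both yamls can be checked against the stock yolov5s head with the scaling rule from `parse_model()` above (a sketch; `make_divisible` mirrors the helper in `utils/general.py`):

```python
import math


def make_divisible(x, divisor):  # mirrors utils.general.make_divisible
    return math.ceil(x / divisor) * divisor


gd, gw = 0.33, 0.50  # yolov5s depth_multiple, width_multiple

# stock yolov5s P5 head row: [-1, 3, C3, [1024, False]]
n, c2 = 3, 1024
print(max(round(n * gd), 1))       # 1   -> the single C3 repeat used here
print(make_divisible(c2 * gw, 8))  # 512 -> the C3 [512, False] row used here
```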