├── README.md
└── models
    ├── common.py
    ├── yolo.py
    ├── yolov5s-mv3l.yaml
    └── yolov5s-mv3s.yaml

/README.md:
--------------------------------------------------------------------------------
# yolov5-mobilenetv3

Compatible YOLOv5 version: https://github.com/ultralytics/yolov5/tree/b5de52c4cdfefb3c7acfbff7d7f450a46b4aaada

MobileNetV3 implementation: https://github.com/chufei1995/mobilenetv3.pytorch

Reference: https://github.com/Syencil/mobile-yolov5-pruning-distillation
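
## Usage

A minimal smoke test (a sketch, assuming the `models/` files here are dropped into the compatible YOLOv5 checkout linked above, so that `models/experimental.py` and the `utils/` package resolve):

```python
import torch

from models.yolo import Model

# build the MobileNetV3-small variant; nc must match the yaml (nc: 2)
model = Model('models/yolov5s-mv3s.yaml', ch=3, nc=2).eval()
pred = model(torch.zeros(1, 3, 640, 640))[0]  # (1, total_anchor_positions, nc + 5)
print(pred.shape)
```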
--------------------------------------------------------------------------------
/models/common.py:
--------------------------------------------------------------------------------
# YOLOv5 common modules

import math
from copy import copy
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import torch
import torch.nn as nn
from PIL import Image
from torch.cuda import amp

from utils.datasets import letterbox
from utils.general import non_max_suppression, make_divisible, scale_coords, increment_path, xyxy2xywh
from utils.plots import color_list, plot_one_box
from utils.torch_utils import time_synchronized


def autopad(k, p=None):  # kernel, padding
    # Pad to 'same'
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p


def DWConv(c1, c2, k=1, s=1, act=True):
    # Depthwise convolution
    return Conv(c1, c2, k, s, g=math.gcd(c1, c2), act=act)


class Conv(nn.Module):
    # Standard convolution
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Conv, self).__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        return self.act(self.conv(x))


class TransformerLayer(nn.Module):
    # Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)
    def __init__(self, c, num_heads):
        super().__init__()
        self.q = nn.Linear(c, c, bias=False)
        self.k = nn.Linear(c, c, bias=False)
        self.v = nn.Linear(c, c, bias=False)
        self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
        self.fc1 = nn.Linear(c, c, bias=False)
        self.fc2 = nn.Linear(c, c, bias=False)

    def forward(self, x):
        x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x
        x = self.fc2(self.fc1(x)) + x
        return x


class TransformerBlock(nn.Module):
    # Vision Transformer https://arxiv.org/abs/2010.11929
    def __init__(self, c1, c2, num_heads, num_layers):
        super().__init__()
        self.conv = None
        if c1 != c2:
            self.conv = Conv(c1, c2)
        self.linear = nn.Linear(c2, c2)  # learnable position embedding
        self.tr = nn.Sequential(*[TransformerLayer(c2, num_heads) for _ in range(num_layers)])
        self.c2 = c2

    def forward(self, x):
        if self.conv is not None:
            x = self.conv(x)
        b, _, w, h = x.shape
        p = x.flatten(2)
        p = p.unsqueeze(0)
        p = p.transpose(0, 3)
        p = p.squeeze(3)
        e = self.linear(p)
        x = p + e

        x = self.tr(x)
        x = x.unsqueeze(3)
        x = x.transpose(0, 3)
        x = x.reshape(b, self.c2, w, h)
        return x


class Bottleneck(nn.Module):
    # Standard bottleneck
    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
        super(Bottleneck, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_, c2, 3, 1, g=g)
        self.add = shortcut and c1 == c2

    def forward(self, x):
        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))


class BottleneckCSP(nn.Module):
    # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super(BottleneckCSP, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
        self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
        self.cv4 = Conv(2 * c_, c2, 1, 1)
        self.bn = nn.BatchNorm2d(2 * c_)  # applied to cat(cv2, cv3)
        self.act = nn.LeakyReLU(0.1, inplace=True)
        self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])

    def forward(self, x):
        y1 = self.cv3(self.m(self.cv1(x)))
        y2 = self.cv2(x)
        return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1))))


class C3(nn.Module):
    # CSP Bottleneck with 3 convolutions
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super(C3, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv(2 * c_, c2, 1)  # act=FReLU(c2)
        self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])
        # self.m = nn.Sequential(*[CrossConv(c_, c_, 3, 1, g, 1.0, shortcut) for _ in range(n)])

    def forward(self, x):
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))


class C3TR(C3):
    # C3 module with TransformerBlock()
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)
        self.m = TransformerBlock(c_, c_, 4, n)


class SPP(nn.Module):
    # Spatial pyramid pooling layer used in YOLOv3-SPP
    def __init__(self, c1, c2, k=(5, 9, 13)):
        super(SPP, self).__init__()
        c_ = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

    def forward(self, x):
        x = self.cv1(x)
        return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))


class Focus(nn.Module):
    # Focus wh information into c-space
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Focus, self).__init__()
        self.conv = Conv(c1 * 4, c2, k, s, p, g, act)
        # self.contract = Contract(gain=2)

    def forward(self, x):  # x(b,c,w,h) -> y(b,4c,w/2,h/2)
        return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1))
        # return self.conv(self.contract(x))

class Contract(nn.Module):
    # Contract width-height into channels, i.e. x(1,64,80,80) to x(1,256,40,40)
    def __init__(self, gain=2):
        super().__init__()
        self.gain = gain

    def forward(self, x):
        N, C, H, W = x.size()  # assert (H / s == 0) and (W / s == 0), 'Indivisible gain'
        s = self.gain
        x = x.view(N, C, H // s, s, W // s, s)  # x(1,64,40,2,40,2)
        x = x.permute(0, 3, 5, 1, 2, 4).contiguous()  # x(1,2,2,64,40,40)
        return x.view(N, C * s * s, H // s, W // s)  # x(1,256,40,40)


class Expand(nn.Module):
    # Expand channels into width-height, i.e. x(1,64,80,80) to x(1,16,160,160)
    def __init__(self, gain=2):
        super().__init__()
        self.gain = gain

    def forward(self, x):
        N, C, H, W = x.size()  # assert C / s ** 2 == 0, 'Indivisible gain'
        s = self.gain
        x = x.view(N, s, s, C // s ** 2, H, W)  # x(1,2,2,16,80,80)
        x = x.permute(0, 3, 4, 1, 5, 2).contiguous()  # x(1,16,80,2,80,2)
        return x.view(N, C // s ** 2, H * s, W * s)  # x(1,16,160,160)


class Concat(nn.Module):
    # Concatenate a list of tensors along dimension
    def __init__(self, dimension=1):
        super(Concat, self).__init__()
        self.d = dimension

    def forward(self, x):
        return torch.cat(x, self.d)


class NMS(nn.Module):
    # Non-Maximum Suppression (NMS) module
    conf = 0.25  # confidence threshold
    iou = 0.45  # IoU threshold
    classes = None  # (optional list) filter by class

    def __init__(self):
        super(NMS, self).__init__()

    def forward(self, x):
        return non_max_suppression(x[0], conf_thres=self.conf, iou_thres=self.iou, classes=self.classes)


class autoShape(nn.Module):
    # input-robust model wrapper for passing cv2/np/PIL/torch inputs. Includes preprocessing, inference and NMS
    conf = 0.25  # NMS confidence threshold
    iou = 0.45  # NMS IoU threshold
    classes = None  # (optional list) filter by class

    def __init__(self, model):
        super(autoShape, self).__init__()
        self.model = model.eval()

    def autoshape(self):
        print('autoShape already enabled, skipping... ')  # model already converted to model.autoshape()
        return self

    @torch.no_grad()
    def forward(self, imgs, size=640, augment=False, profile=False):
        # Inference from various sources. For height=640, width=1280, RGB images example inputs are:
        #   filename:   imgs = 'data/samples/zidane.jpg'
        #   URI:             = 'https://github.com/ultralytics/yolov5/releases/download/v1.0/zidane.jpg'
        #   OpenCV:          = cv2.imread('image.jpg')[:,:,::-1]  # HWC BGR to RGB x(640,1280,3)
        #   PIL:             = Image.open('image.jpg')  # HWC x(640,1280,3)
        #   numpy:           = np.zeros((640,1280,3))  # HWC
        #   torch:           = torch.zeros(16,3,320,640)  # BCHW (scaled to size=640, 0-1 values)
        #   multiple:        = [Image.open('image1.jpg'), Image.open('image2.jpg'), ...]  # list of images

        t = [time_synchronized()]
        p = next(self.model.parameters())  # for device and type
        if isinstance(imgs, torch.Tensor):  # torch
            with amp.autocast(enabled=p.device.type != 'cpu'):
                return self.model(imgs.to(p.device).type_as(p), augment, profile)  # inference

        # Pre-process
        n, imgs = (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs])  # number of images, list of images
        shape0, shape1, files = [], [], []  # image and inference shapes, filenames
        for i, im in enumerate(imgs):
            f = f'image{i}'  # filename
            if isinstance(im, str):  # filename or uri
                im, f = np.asarray(Image.open(requests.get(im, stream=True).raw if im.startswith('http') else im)), im
            elif isinstance(im, Image.Image):  # PIL Image
                im, f = np.asarray(im), getattr(im, 'filename', f) or f
            files.append(Path(f).with_suffix('.jpg').name)
            if im.shape[0] < 5:  # image in CHW
                im = im.transpose((1, 2, 0))  # reverse dataloader .transpose(2, 0, 1)
            im = im[:, :, :3] if im.ndim == 3 else np.tile(im[:, :, None], 3)  # enforce 3ch input
            s = im.shape[:2]  # HWC
            shape0.append(s)  # image shape
            g = (size / max(s))  # gain
            shape1.append([y * g for y in s])
            imgs[i] = im  # update
        shape1 = [make_divisible(x, int(self.stride.max())) for x in np.stack(shape1, 0).max(0)]  # inference shape
        x = [letterbox(im, new_shape=shape1, auto=False)[0] for im in imgs]  # pad
        x = np.stack(x, 0) if n > 1 else x[0][None]  # stack
        x = np.ascontiguousarray(x.transpose((0, 3, 1, 2)))  # BHWC to BCHW
        x = torch.from_numpy(x).to(p.device).type_as(p) / 255.  # uint8 to fp16/32
        t.append(time_synchronized())

        with amp.autocast(enabled=p.device.type != 'cpu'):
            # Inference
            y = self.model(x, augment, profile)[0]  # forward
            t.append(time_synchronized())

            # Post-process
            y = non_max_suppression(y, conf_thres=self.conf, iou_thres=self.iou, classes=self.classes)  # NMS
            for i in range(n):
                scale_coords(shape1, y[i][:, :4], shape0[i])

            t.append(time_synchronized())
            return Detections(imgs, y, files, t, self.names, x.shape)


class Detections:
    # detections class for YOLOv5 inference results
    def __init__(self, imgs, pred, files, times=None, names=None, shape=None):
        super(Detections, self).__init__()
        d = pred[0].device  # device
        gn = [torch.tensor([*[im.shape[i] for i in [1, 0, 1, 0]], 1., 1.], device=d) for im in imgs]  # normalizations
        self.imgs = imgs  # list of images as numpy arrays
        self.pred = pred  # list of tensors pred[0] = (xyxy, conf, cls)
        self.names = names  # class names
        self.files = files  # image filenames
        self.xyxy = pred  # xyxy pixels
        self.xywh = [xyxy2xywh(x) for x in pred]  # xywh pixels
        self.xyxyn = [x / g for x, g in zip(self.xyxy, gn)]  # xyxy normalized
        self.xywhn = [x / g for x, g in zip(self.xywh, gn)]  # xywh normalized
        self.n = len(self.pred)  # number of images (batch size)
        self.t = tuple((times[i + 1] - times[i]) * 1000 / self.n for i in range(3)) if times is not None else (0., 0., 0.)  # per-stage times (ms)
        self.s = shape  # inference BCHW shape

    def display(self, pprint=False, show=False, save=False, render=False, save_dir=''):
        colors = color_list()
        for i, (img, pred) in enumerate(zip(self.imgs, self.pred)):
            s = f'image {i + 1}/{len(self.pred)}: {img.shape[0]}x{img.shape[1]} '
            if pred is not None:
                for c in pred[:, -1].unique():
                    n = (pred[:, -1] == c).sum()  # detections per class
                    s += f"{n} {self.names[int(c)]}{'s' * (n > 1)}, "  # add to string
                if show or save or render:
                    for *box, conf, cls in pred:  # xyxy, confidence, class
                        label = f'{self.names[int(cls)]} {conf:.2f}'
                        plot_one_box(box, img, label=label, color=colors[int(cls) % 10])
            img = Image.fromarray(img.astype(np.uint8)) if isinstance(img, np.ndarray) else img  # from np
            if pprint:
                print(s.rstrip(', '))
            if show:
                img.show(self.files[i])  # show
            if save:
                f = self.files[i]
                img.save(Path(save_dir) / f)  # save
                print(f"{'Saved' * (i == 0)} {f}", end=',' if i < self.n - 1 else f' to {save_dir}\n')
            if render:
                self.imgs[i] = np.asarray(img)

    def print(self):
        self.display(pprint=True)  # print results
        print(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {tuple(self.s)}' % self.t)

    def show(self):
        self.display(show=True)  # show results

    def save(self, save_dir='runs/hub/exp'):
        save_dir = increment_path(save_dir, exist_ok=save_dir != 'runs/hub/exp')  # increment save_dir
        Path(save_dir).mkdir(parents=True, exist_ok=True)
        self.display(save=True, save_dir=save_dir)  # save results

    def render(self):
        self.display(render=True)  # render results
        return self.imgs

    def pandas(self):
        # return detections as pandas DataFrames, i.e. print(results.pandas().xyxy[0])
        new = copy(self)  # return copy
        ca = 'xmin', 'ymin', 'xmax', 'ymax', 'confidence', 'class', 'name'  # xyxy columns
        cb = 'xcenter', 'ycenter', 'width', 'height', 'confidence', 'class', 'name'  # xywh columns
        for k, c in zip(['xyxy', 'xyxyn', 'xywh', 'xywhn'], [ca, ca, cb, cb]):
            a = [[x[:5] + [int(x[5]), self.names[int(x[5])]] for x in x.tolist()] for x in getattr(self, k)]  # update
            setattr(new, k, [pd.DataFrame(x, columns=c) for x in a])
        return new

    def tolist(self):
        # return a list of Detections objects, i.e. 'for result in results.tolist():'
        x = [Detections([self.imgs[i]], [self.pred[i]], [self.files[i]], names=self.names, shape=self.s)
             for i in range(self.n)]  # pass files/names/shape to their keyword slots
        for d in x:
            for k in ['imgs', 'pred', 'xyxy', 'xyxyn', 'xywh', 'xywhn']:
                setattr(d, k, getattr(d, k)[0])  # pop out of list
        return x

    def __len__(self):
        return self.n


class Classify(nn.Module):
    # Classification head, i.e. x(b,c1,20,20) to x(b,c2)
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Classify, self).__init__()
        self.aap = nn.AdaptiveAvgPool2d(1)  # to x(b,c1,1,1)
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g)  # to x(b,c2,1,1)
        self.flat = nn.Flatten()

    def forward(self, x):
        z = torch.cat([self.aap(y) for y in (x if isinstance(x, list) else [x])], 1)  # cat if list
        return self.flat(self.conv(z))  # flatten to x(b,c2)


### mobilenetv3 ###

class h_sigmoid(nn.Module):
    def __init__(self, inplace=True):
        super(h_sigmoid, self).__init__()
        self.relu = nn.ReLU6(inplace=inplace)

    def forward(self, x):
        return self.relu(x + 3) / 6


class h_swish(nn.Module):
    def __init__(self, inplace=True):
        super(h_swish, self).__init__()
        self.sigmoid = h_sigmoid(inplace=inplace)

    def forward(self, x):
        y = self.sigmoid(x)
        return x * y


class SELayer(nn.Module):
    def __init__(self, channel, reduction=4):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel),
            h_sigmoid()
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x)
        y = y.view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y


class Conv3BN(nn.Module):
    """
    This is equivalent to
        def conv_3x3_bn(inp, oup, stride):
            return nn.Sequential(
                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
                nn.BatchNorm2d(oup),
                h_swish()
            )
    """
    def __init__(self, inp, oup, stride):
        super(Conv3BN, self).__init__()
        self.conv = nn.Conv2d(inp, oup, 3, stride, 1, bias=False)
        self.bn = nn.BatchNorm2d(oup)
        self.act = h_swish()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        return self.act(self.conv(x))


class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, hidden_dim, kernel_size, stride, use_se, use_hs):
        super(InvertedResidual, self).__init__()
        assert stride in [1, 2]

        self.identity = stride == 1 and inp == oup

        if inp == hidden_dim:
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride, (kernel_size - 1) // 2, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                h_swish() if use_hs else nn.ReLU(inplace=True),
                # Squeeze-and-Excite
                SELayer(hidden_dim) if use_se else nn.Sequential(),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                h_swish() if use_hs else nn.ReLU(inplace=True),
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride, (kernel_size - 1) // 2, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                # Squeeze-and-Excite
                SELayer(hidden_dim) if use_se else nn.Sequential(),
                h_swish() if use_hs else nn.ReLU(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )

    def forward(self, x):
        y = self.conv(x)
        if self.identity:
            return x + y
        else:
            return y
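

if __name__ == '__main__':
    # quick shape check for the mobilenetv3 blocks above (a sketch; the values
    # mirror the first InvertedResidual rows of yolov5s-mv3l.yaml, where the
    # constructor order is inp, oup, hidden_dim, kernel_size, stride, use_se, use_hs)
    stem = Conv3BN(3, 16, 2)
    block = InvertedResidual(16, 24, 64, 3, 2, use_se=0, use_hs=0)
    print(stem(torch.zeros(1, 3, 128, 128)).shape)  # torch.Size([1, 16, 64, 64])
    print(block(torch.zeros(1, 16, 64, 64)).shape)  # torch.Size([1, 24, 32, 32])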

--------------------------------------------------------------------------------
/models/yolo.py:
--------------------------------------------------------------------------------
# YOLOv5 YOLO-specific modules

import argparse
import logging
import sys
from copy import deepcopy

sys.path.append('./')  # to run '$ python *.py' files in subdirectories
logger = logging.getLogger(__name__)

from models.common import *
from models.experimental import *
from utils.autoanchor import check_anchor_order
from utils.general import make_divisible, check_file, set_logging
from utils.torch_utils import time_synchronized, fuse_conv_and_bn, model_info, scale_img, initialize_weights, \
    select_device, copy_attr

try:
    import thop  # for FLOPS computation
except ImportError:
    thop = None


class Detect(nn.Module):
    stride = None  # strides computed during build
    export = False  # onnx export

    def __init__(self, nc=80, anchors=(), ch=()):  # detection layer
        super(Detect, self).__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [torch.zeros(1)] * self.nl  # init grid
        a = torch.tensor(anchors).float().view(self.nl, -1, 2)
        self.register_buffer('anchors', a)  # shape(nl,na,2)
        self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2))  # shape(nl,1,na,1,1,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv

    def forward(self, x):
        # x = x.copy()  # for profiling
        z = []  # inference output
        self.training |= self.export
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                y = x[i].sigmoid()
                y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                z.append(y.view(bs, -1, self.no))

        return x if self.training else (torch.cat(z, 1), x)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()


class Model(nn.Module):
    def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None):  # model, input channels, number of classes
        super(Model, self).__init__()
        if isinstance(cfg, dict):
            self.yaml = cfg  # model dict
        else:  # is *.yaml
            import yaml  # for torch hub
            self.yaml_file = Path(cfg).name
            with open(cfg) as f:
                self.yaml = yaml.load(f, Loader=yaml.SafeLoader)  # model dict

        # Define model
        ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
        if nc and nc != self.yaml['nc']:
            logger.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
            self.yaml['nc'] = nc  # override yaml value
        if anchors:
            logger.info(f'Overriding model.yaml anchors with anchors={anchors}')
            self.yaml['anchors'] = round(anchors)  # override yaml value
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist
        self.names = [str(i) for i in range(self.yaml['nc'])]  # default names
        # print([x.shape for x in self.forward(torch.zeros(1, ch, 64, 64))])

        # Build strides, anchors
        m = self.model[-1]  # Detect()
        if isinstance(m, Detect):
            s = 256  # 2x min stride
            m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))])  # forward
            m.anchors /= m.stride.view(-1, 1, 1)
            check_anchor_order(m)
            self.stride = m.stride
            self._initialize_biases()  # only run once
            # print('Strides: %s' % m.stride.tolist())

        # Init weights, biases
        initialize_weights(self)
        self.info()
        logger.info('')

    def forward(self, x, augment=False, profile=False):
        if augment:
            img_size = x.shape[-2:]  # height, width
            s = [1, 0.83, 0.67]  # scales
            f = [None, 3, None]  # flips (2-ud, 3-lr)
            y = []  # outputs
            for si, fi in zip(s, f):
                xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))
                yi = self.forward_once(xi)[0]  # forward
                # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1])  # save
                yi[..., :4] /= si  # de-scale
                if fi == 2:
                    yi[..., 1] = img_size[0] - yi[..., 1]  # de-flip ud
                elif fi == 3:
                    yi[..., 0] = img_size[1] - yi[..., 0]  # de-flip lr
                y.append(yi)
            return torch.cat(y, 1), None  # augmented inference, train
        else:
            return self.forward_once(x, profile)  # single-scale inference, train

    def forward_once(self, x, profile=False):
        y, dt = [], []  # outputs
        for m in self.model:
            if m.f != -1:  # if not from previous layer
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers

            if profile:
                o = thop.profile(m, inputs=(x,), verbose=False)[0] / 1E9 * 2 if thop else 0  # FLOPS
                t = time_synchronized()
                for _ in range(10):
                    _ = m(x)
                dt.append((time_synchronized() - t) * 100)
                print('%10.1f%10.0f%10.1fms %-40s' % (o, m.np, dt[-1], m.type))

            x = m(x)  # run
            y.append(x if m.i in self.save else None)  # save output

        if profile:
            print('%.1fms total' % sum(dt))
        return x

    def _initialize_biases(self, cf=None):  # initialize biases into Detect(), cf is class frequency
        # https://arxiv.org/abs/1708.02002 section 3.3
        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
        m = self.model[-1]  # Detect() module
        for mi, s in zip(m.m, m.stride):  # from
            b = mi.bias.view(m.na, -1)  # conv.bias(255) to (3,85)
            b.data[:, 4] += math.log(8 / (640 / s) ** 2)  # obj (8 objects per 640 image)
            b.data[:, 5:] += math.log(0.6 / (m.nc - 0.99)) if cf is None else torch.log(cf / cf.sum())  # cls
            mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)

    def _print_biases(self):
        m = self.model[-1]  # Detect() module
        for mi in m.m:  # from
            b = mi.bias.detach().view(m.na, -1).T  # conv.bias(255) to (3,85)
            print(('%6g Conv2d.bias:' + '%10.3g' * 6) % (mi.weight.shape[1], *b[:5].mean(1).tolist(), b[5:].mean()))

    # def _print_weights(self):
    #     for m in self.model.modules():
    #         if type(m) is Bottleneck:
    #             print('%10.3g' % (m.w.detach().sigmoid() * 2))  # shortcut weights

    def fuse(self):  # fuse model Conv2d() + BatchNorm2d() layers
        print('Fusing layers... ')
        for m in self.model.modules():
            if type(m) is Conv and hasattr(m, 'bn'):
                m.conv = fuse_conv_and_bn(m.conv, m.bn)  # update conv
                delattr(m, 'bn')  # remove batchnorm
                m.forward = m.fuseforward  # update forward
        self.info()
        return self

    def nms(self, mode=True):  # add or remove NMS module
        present = type(self.model[-1]) is NMS  # last layer is NMS
        if mode and not present:
            print('Adding NMS... ')
            m = NMS()  # module
            m.f = -1  # from
            m.i = self.model[-1].i + 1  # index
            self.model.add_module(name='%s' % m.i, module=m)  # add
            self.eval()
        elif not mode and present:
            print('Removing NMS... ')
            self.model = self.model[:-1]  # remove
        return self

    def autoshape(self):  # add autoShape module
        print('Adding autoShape... ')
        m = autoShape(self)  # wrap model
        copy_attr(m, self, include=('yaml', 'nc', 'hyp', 'names', 'stride'), exclude=())  # copy attributes
        return m

    def info(self, verbose=False, img_size=640):  # print model information
        model_info(self, verbose, img_size)


def parse_model(d, ch):  # model_dict, input_channels(3)
    logger.info('\n%3s%18s%3s%10s  %-40s%-30s' % ('', 'from', 'n', 'params', 'module', 'arguments'))
    anchors, nc, gd, gw = d['anchors'], d['nc'], d['depth_multiple'], d['width_multiple']
    na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors
    no = na * (nc + 5)  # number of outputs = anchors * (classes + 5)

    layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
    for i, (f, n, m, args) in enumerate(d['backbone'] + d['head']):  # from, number, module, args
        m = eval(m) if isinstance(m, str) else m  # eval strings
        for j, a in enumerate(args):
            try:
                args[j] = eval(a) if isinstance(a, str) else a  # eval strings
            except:
                pass

        n = max(round(n * gd), 1) if n > 1 else n  # depth gain
        if m in [Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, DWConv, MixConv2d, Focus, CrossConv, BottleneckCSP,
                 C3, C3TR, Conv3BN, InvertedResidual]:
            c1, c2 = ch[f], args[0]
            if c2 != no:  # if not output
                c2 = make_divisible(c2 * gw, 8)

            args = [c1, c2, *args[1:]]
            if m in [BottleneckCSP, C3, C3TR]:
                args.insert(2, n)  # number of repeats
                n = 1
        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        elif m is Concat:
            c2 = sum([ch[x] for x in f])
        elif m is Detect:
            args.append([ch[x] for x in f])
            if isinstance(args[1], int):  # number of anchors
                args[1] = [list(range(args[1] * 2))] * len(f)
        elif m is Contract:
            c2 = ch[f] * args[0] ** 2
        elif m is Expand:
            c2 = ch[f] // args[0] ** 2
        else:
            c2 = ch[f]

        m_ = nn.Sequential(*[m(*args) for _ in range(n)]) if n > 1 else m(*args)  # module
        t = str(m)[8:-2].replace('__main__.', '')  # module type
        np = sum([x.numel() for x in m_.parameters()])  # number params
        m_.i, m_.f, m_.type, m_.np = i, f, t, np  # attach index, 'from' index, type, number params
        logger.info('%3s%18s%3s%10.0f  %-40s%-30s' % (i, f, n, np, t, args))  # print
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        if i == 0:
            ch = []
        ch.append(c2)
    return nn.Sequential(*layers), sorted(save)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg', type=str, default='yolov5s.yaml', help='model.yaml')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    opt = parser.parse_args()
    opt.cfg = check_file(opt.cfg)  # check file
    set_logging()
    device = select_device(opt.device)

    # Create model
    model = Model(opt.cfg).to(device)
    model.train()

    # Profile
    # img = torch.rand(8 if torch.cuda.is_available() else 1, 3, 640, 640).to(device)
    # y = model(img, profile=True)

    # Tensorboard
    # from torch.utils.tensorboard import SummaryWriter
    # tb_writer = SummaryWriter()
    # print("Run 'tensorboard --logdir=models/runs' to view tensorboard at http://localhost:6006/")
    # tb_writer.add_graph(model.model, img)  # add model to tensorboard
    # tb_writer.add_image('test', img[0], dataformats='CWH')  # add model to tensorboard
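
    # Fuse + autoShape (example usage of the fuse()/autoshape() methods above;
    # autoShape expects cv2/np/PIL/file inputs and returns a Detections object,
    # the image path below is only a placeholder)
    # model = model.fuse().eval()
    # model = model.autoshape()
    # results = model('data/images/zidane.jpg')
    # results.print()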

--------------------------------------------------------------------------------
/models/yolov5s-mv3l.yaml:
--------------------------------------------------------------------------------
# parameters
nc: 2  # number of classes
depth_multiple: 1.0  # don't change this, otherwise InvertedResidual will be affected
width_multiple: 1.0  # don't change this, otherwise InvertedResidual will be affected

# anchors
anchors:
  - [10,13, 16,30, 33,23]  # P3/8
  - [30,61, 62,45, 59,119]  # P4/16
  - [116,90, 156,198, 373,326]  # P5/32

# YOLOv5 backbone
backbone:
  # MobileNetV3-large
  # [from, number, module, args]
  [[-1, 1, Conv3BN, [16, 2]],                          # 0-p1/2
   [-1, 1, InvertedResidual, [ 16,  16, 3, 1, 0, 0]],  # 1-p1/2
   [-1, 1, InvertedResidual, [ 24,  64, 3, 2, 0, 0]],  # 2-p2/4
   [-1, 1, InvertedResidual, [ 24,  72, 3, 1, 0, 0]],  # 3-p2/4
   [-1, 1, InvertedResidual, [ 40,  72, 5, 2, 1, 0]],  # 4-p3/8
   [-1, 1, InvertedResidual, [ 40, 120, 5, 1, 1, 0]],  # 5-p3/8
   [-1, 1, InvertedResidual, [ 40, 120, 5, 1, 1, 0]],  # 6-p3/8
   [-1, 1, InvertedResidual, [ 80, 240, 3, 2, 0, 1]],  # 7-p4/16
   [-1, 1, InvertedResidual, [ 80, 200, 3, 1, 0, 1]],  # 8-p4/16
   [-1, 1, InvertedResidual, [ 80, 184, 3, 1, 0, 1]],  # 9-p4/16
   [-1, 1, InvertedResidual, [ 80, 184, 3, 1, 0, 1]],  # 10-p4/16
   [-1, 1, InvertedResidual, [112, 480, 3, 1, 1, 1]],  # 11-p4/16
   [-1, 1, InvertedResidual, [112, 672, 3, 1, 1, 1]],  # 12-p4/16
   [-1, 1, InvertedResidual, [160, 672, 5, 1, 1, 1]],  # 13-p4/16
   [-1, 1, InvertedResidual, [160, 672, 5, 2, 1, 1]],  # 14-p5/32
   [-1, 1, InvertedResidual, [160, 960, 5, 1, 1, 1]],  # 15-p5/32
  ]


# YOLOv5 head
# the head is still the yolov5s head, but depth_multiple and width_multiple are set to 1.0 here,
# so the Conv and C3 values below are pre-scaled by hand:
#   depth_multiple of yolov5s is 0.33, i.e. C3 repeats: 3 -> 1
#   width_multiple of yolov5s is 0.50, i.e. ch_out is halved
head:
  [[-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 13], 1, Concat, [1]],  # cat backbone P4
   [-1, 1, C3, [256, False]],  # 19

   [-1, 1, Conv, [128, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P3
   [-1, 1, C3, [128, False]],  # 23 (P3/8-small)

   [-1, 1, Conv, [128, 3, 2]],
   [[-1, 20], 1, Concat, [1]],  # cat head P4
   [-1, 1, C3, [256, False]],  # 26 (P4/16-medium)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 16], 1, Concat, [1]],  # cat head P5
   [-1, 1, C3, [512, False]],  # 29 (P5/32-large)

   [[23, 26, 29], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]
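
# note on InvertedResidual args: parse_model() in models/yolo.py prepends ch_in,
# so a backbone row [-1, 1, InvertedResidual, [oup, hidden_dim, k, s, use_se, use_hs]]
# builds InvertedResidual(inp=ch_in, oup, hidden_dim, kernel_size=k, stride=s, use_se, use_hs);
# e.g. layer 4 above becomes InvertedResidual(24, 40, 72, 5, 2, 1, 0)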
--------------------------------------------------------------------------------
/models/yolov5s-mv3s.yaml:
--------------------------------------------------------------------------------
# parameters
nc: 2  # number of classes
depth_multiple: 1.0  # don't change this, otherwise InvertedResidual will be affected
width_multiple: 1.0  # don't change this, otherwise InvertedResidual will be affected

# anchors
anchors:
  - [10,13, 16,30, 33,23]  # P3/8
  - [30,61, 62,45, 59,119]  # P4/16
  - [116,90, 156,198, 373,326]  # P5/32

# YOLOv5 backbone
backbone:
  # MobileNetV3-small
  # [from, number, module, args]
  [[-1, 1, Conv3BN, [16, 2]],                         # 0-p1/2
   [-1, 1, InvertedResidual, [16,  16, 3, 2, 1, 0]],  # 1-p2/4
   [-1, 1, InvertedResidual, [24,  72, 3, 2, 0, 0]],  # 2-p3/8
   [-1, 1, InvertedResidual, [24,  88, 3, 1, 0, 0]],  # 3-p3/8
   [-1, 1, InvertedResidual, [40,  96, 5, 2, 1, 1]],  # 4-p4/16
   [-1, 1, InvertedResidual, [40, 240, 5, 1, 1, 1]],  # 5-p4/16
   [-1, 1, InvertedResidual, [40, 240, 5, 1, 1, 1]],  # 6-p4/16
   [-1, 1, InvertedResidual, [48, 120, 5, 1, 1, 1]],  # 7-p4/16
   [-1, 1, InvertedResidual, [48, 144, 5, 1, 1, 1]],  # 8-p4/16
   [-1, 1, InvertedResidual, [96, 288, 5, 2, 1, 1]],  # 9-p5/32
   [-1, 1, InvertedResidual, [96, 576, 5, 1, 1, 1]],  # 10-p5/32
   [-1, 1, InvertedResidual, [96, 576, 5, 1, 1, 1]],  # 11-p5/32
  ]

# YOLOv5 head
# the head is still the yolov5s head, but depth_multiple and width_multiple are set to 1.0 here,
# so the Conv and C3 values below are pre-scaled by hand:
#   depth_multiple of yolov5s is 0.33, i.e. C3 repeats: 3 -> 1
#   width_multiple of yolov5s is 0.50, i.e. ch_out is halved
head:
  [[-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 8], 1, Concat, [1]],  # cat backbone P4
   [-1, 1, C3, [256, False]],  # 15

   [-1, 1, Conv, [128, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 3], 1, Concat, [1]],  # cat backbone P3
   [-1, 1, C3, [128, False]],  # 19 (P3/8-small)

   [-1, 1, Conv, [128, 3, 2]],
   [[-1, 16], 1, Concat, [1]],  # cat head P4
   [-1, 1, C3, [256, False]],  # 22 (P4/16-medium)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 12], 1, Concat, [1]],  # cat head P5
   [-1, 1, C3, [512, False]],  # 25 (P5/32-large)

   [[19, 22, 25], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]
--------------------------------------------------------------------------------
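
The hand-scaled head values in both yamls can be checked against the stock yolov5s head with the scaling rule from `parse_model()` above (a sketch; `make_divisible` mirrors the helper in `utils/general.py`):

```python
import math


def make_divisible(x, divisor):  # mirrors utils.general.make_divisible
    return math.ceil(x / divisor) * divisor


gd, gw = 0.33, 0.50  # yolov5s depth_multiple, width_multiple

# stock yolov5s P5 head row: [-1, 3, C3, [1024, False]]
n, c2 = 3, 1024
print(max(round(n * gd), 1))       # 1   -> the single C3 repeat used here
print(make_divisible(c2 * gw, 8))  # 512 -> the C3 [512, False] row used here
```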