├── .gitignore ├── CFENet └── scripts │ ├── cfenet.py │ ├── datagen.py │ ├── encoder.py │ ├── multibox_layer.py │ ├── multibox_loss.py │ ├── train.py │ └── util.py ├── MDSSD_300 └── scripts │ ├── datagen.py │ ├── encoder.py │ ├── fusion.py │ ├── mdssd.py │ ├── multibox_layer.py │ ├── multibox_loss.py │ ├── norm.py │ ├── test.py │ └── train.py ├── MDSSD_512 └── scripts │ ├── datagen.py │ ├── encoder.py │ ├── fusion.py │ ├── mdssd.py │ ├── multibox_layer.py │ ├── multibox_loss.py │ ├── norm.py │ ├── test.py │ └── train.py ├── MDSSD_augment └── scripts │ ├── augment.py │ ├── datagen.py │ ├── encoder.py │ ├── fusion.py │ ├── mdssd.py │ ├── multibox_layer.py │ ├── multibox_loss.py │ ├── norm.py │ ├── test.py │ └── train.py ├── MDSSD_with_self_attention └── scripts │ ├── attention.py │ ├── datagen.py │ ├── encoder.py │ ├── fusion.py │ ├── gen_test_file.py │ ├── mdssd.py │ ├── multibox_layer.py │ ├── multibox_loss.py │ ├── norm.py │ ├── test.py │ └── train.py ├── README.md └── SSD └── scripts ├── convert_vgg.py ├── datagen.py ├── encoder.py ├── multibox_layer.py ├── multibox_loss.py ├── ssd.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /CFENet/scripts/cfenet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | 9 | from torch.autograd import Variable 10 | 11 | from multibox_layer import MultiBoxLayer 12 | from util import CFE, FFB 13 | 14 | 15 | class L2Norm(nn.Module): 16 | '''L2Norm layer across all channels and scale.''' 17 | def __init__(self, in_features,scale): 18 | super(L2Norm, 
self).__init__() 19 | self.weight = nn.Parameter(torch.Tensor(in_features)) 20 | self.reset_parameters(scale) 21 | 22 | def reset_parameters(self, scale): 23 | nn.init.constant_(self.weight, scale) 24 | 25 | def forward(self, x): 26 | x = F.normalize(x, dim=1) 27 | scale = self.weight[None,:,None,None] 28 | return scale * x 29 | 30 | 31 | class CFENet(nn.Module): 32 | input_size = 300 33 | 34 | def __init__(self): 35 | super(CFENet, self).__init__() 36 | 37 | # model 38 | self.base = self.VGG16() 39 | self.norm4 = L2Norm(512, 20) # 38 40 | 41 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 42 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 43 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 44 | 45 | self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 46 | 47 | self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 48 | 49 | self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1) 50 | self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2) 51 | 52 | self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1) 53 | self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2) 54 | 55 | self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1) 56 | self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3) 57 | 58 | self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1) 59 | self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3) 60 | 61 | # CFE 62 | self.cfe1 = CFE(512, 3) 63 | self.cfe2 = CFE(512, 3) 64 | self.cfe3 = CFE(512, 3) 65 | self.cfe4 = CFE(512, 3) 66 | 67 | # FFB 68 | self.ffb1 = FFB(512,512) 69 | self.ffb2 = FFB(512,512) 70 | 71 | # multibox layer 72 | self.multibox = MultiBoxLayer() 73 | 74 | def forward(self, x): 75 | hs = [] 76 | ffb = [] 77 | 78 | h = self.base(x) 79 | # hs.append(self.norm4(h)) # conv4_3 80 | ffb.append(h) 81 | h = self.cfe1(h) 82 | h = F.max_pool2d(h, kernel_size=2, stride=2, ceil_mode=True) 83 | 84 | h = F.relu(self.conv5_1(h)) 85 | h = F.relu(self.conv5_2(h)) 86 | h = F.relu(self.conv5_3(h)) 87 | ffb.append(h) 88 | h = self.cfe2(h) 89 | h = F.max_pool2d(h, kernel_size=3, padding=1, stride=1, ceil_mode=True) 90 | 91 | h = F.relu(self.conv6(h)) 92 | h = F.relu(self.conv7(h)) 93 | # hs.append(h) # conv7 94 | h = F.relu(self.conv8_1(h)) 95 | h = F.relu(self.conv8_2(h)) 96 | hs.append(h) # conv8_2 97 | ffb.append(h) 98 | h = F.relu(self.conv9_1(h)) 99 | h = F.relu(self.conv9_2(h)) 100 | hs.append(h) # conv9_2 101 | h = F.relu(self.conv10_1(h)) 102 | h = F.relu(self.conv10_2(h)) 103 | hs.append(h) # conv10_2 104 | h = F.relu(self.conv11_1(h)) 105 | h = F.relu(self.conv11_2(h)) 106 | hs.append(h) # conv11_2 107 | 108 | # Feature fusion blocks followed by Comprehensive Feature Enhancement(CFE) module 109 | f1 = self.ffb1(ffb[0], ffb[1]) 110 | f1 = self.cfe3(f1) 111 | hs.append(f1) 112 | f2 = self.ffb2(ffb[1], ffb[2]) 113 | f2 = self.cfe4(f2) 114 | hs.append(f2) 115 | 116 | loc_preds, conf_preds = self.multibox(hs) 117 | 118 | return loc_preds, conf_preds 119 | 120 | def VGG16(self): 121 | '''VGG16 layers.''' 122 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512] 123 | layers = [] 124 | in_channels = 3 125 | for x in cfg: 126 | if x == 'M': 127 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 128 | else: 129 | layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 130 | nn.ReLU(True)] 131 | in_channels = x 132 | return nn.Sequential(*layers) 133 | 134 | if __name__ == '__main__': 135 | t = torch.randn(1, 3, 300, 300) 136 | net = CFENet() 
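# Note added for clarity (not part of the original file): for a 300x300 input the six
# heads passed to MultiBoxLayer are conv8_2 (10x10x512), conv9_2 (5x5x256),
# conv10_2 (3x3x256), conv11_2 (1x1x256) and the two FFB outputs (38x38x512,
# 19x19x512). With [6,6,4,4,4,6] anchors per location this gives
# 6*100 + 6*25 + 4*9 + 4*1 + 4*1444 + 6*361 = 8732 default boxes, so a forward
# pass here would return loc_preds of size [1, 8732, 4] and conf_preds of size [1, 8732, 13].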
137 | # res = net.forward(t) 138 | print(net.base) 139 | -------------------------------------------------------------------------------- /CFENet/scripts/datagen.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load image/class/box from a annotation file. 3 | 4 | ''' 5 | from __future__ import print_function 6 | 7 | import os 8 | import sys 9 | import os.path 10 | 11 | import random 12 | import numpy as np 13 | 14 | import torch 15 | import torch.utils.data as data 16 | import torchvision.transforms as transforms 17 | 18 | from encoder import DataEncoder 19 | import cv2 20 | 21 | class ListDataset(data.Dataset): 22 | img_size = 300 23 | 24 | def __init__(self, root, list_file, train, transform): 25 | ''' 26 | Args: 27 | root: (str) ditectory to images. 28 | list_file: (str) path to annotation files. 29 | train: (boolean) train or test. 30 | transform: ([transforms]) image transforms. 31 | ''' 32 | self.root = root 33 | self.list_file = list_file 34 | self.train = train 35 | self.transform = transform 36 | 37 | self.fnames = [] 38 | self.boxes = [] 39 | self.labels = [] 40 | 41 | self.data_encoder = DataEncoder() 42 | self.num_samples = 0 43 | 44 | # VisDrone 45 | 46 | for i in os.listdir(list_file): 47 | self.num_samples += 1 48 | self.fnames.append(i) 49 | 50 | def __getitem__(self, idx): 51 | '''Load a image, and encode its bbox locations and class labels. 52 | Args: 53 | idx: (int) image index. 54 | Returns: 55 | img: (tensor) image tensor. 56 | loc_target: (tensor) location targets, sized [8732,4]. 57 | conf_target: (tensor) label targets, sized [8732,]. 58 | ''' 59 | # Load image and bbox locations. 60 | fname = self.fnames[idx] 61 | img = cv2.imread(os.path.join(self.root, fname[:-4]+".jpg")) 62 | 63 | box = [] 64 | label = [] 65 | with open(os.path.join(self.list_file,fname)) as f: 66 | f = f.read().split("\n") 67 | f = f[:-1] 68 | num_objs = len(f) 69 | 70 | for j in range(num_objs): 71 | f[j] = f[j].split(",") 72 | xmin = float(f[j][0]) 73 | ymin = float(f[j][1]) 74 | w = float(f[j][2]) 75 | h = float(f[j][3]) 76 | 77 | box.append([xmin,ymin,xmin+w,ymin+h]) 78 | label.append(int(f[j][5])) 79 | 80 | ''' 81 | # **************************** AUGMENTATION ************************************ 82 | # Copy and paste small objects at random locations in 83 | # image to increase the number of samples with small sizes. 84 | box_new = box.copy() 85 | label_new = label.copy() 86 | img_new = img.copy() 87 | for n in range(len(box)): 88 | j = box[n] 89 | if j[2]*j[3]<500: 90 | crop = img[int(j[1]):int(j[1]+j[3]),int(j[0]):int(j[0]+j[2])] 91 | x = random.randrange(0, img.shape[1],1) 92 | y = random.randrange(0, img.shape[0],1) 93 | 94 | try: 95 | img_new[int(y):int(y+j[3]),int(x):int(x+j[2])] = crop 96 | box_new.append([x,y,j[2],j[3]]) 97 | label_new.append(label[n]) 98 | except: 99 | continue 100 | 101 | # ******************************************************************************** 102 | 103 | self.boxes.append(torch.Tensor(box_new)) 104 | self.labels.append(torch.LongTensor(label_new)) 105 | img = img_new 106 | ''' 107 | 108 | self.boxes.append(torch.Tensor(box)) 109 | self.labels.append(torch.LongTensor(label)) 110 | 111 | boxes = self.boxes[-1].clone() 112 | labels = self.labels[-1] 113 | 114 | # Data augmentation while training. 115 | if self.train: 116 | img, boxes = self.random_flip(img, boxes) 117 | img, boxes, labels = self.random_crop(img, boxes, labels) 118 | 119 | # Scale bbox locaitons to [0,1]. 
120 | w,h = img.shape[1], img.shape[0] 121 | boxes /= torch.Tensor([w,h,w,h]).expand_as(boxes) 122 | img = cv2.resize(img, (self.img_size,self.img_size)) 123 | img = self.transform(img) 124 | 125 | # Encode loc & conf targets. 126 | 127 | loc_target, conf_target = self.data_encoder.encode(boxes, labels) 128 | return img, loc_target, conf_target 129 | 130 | def random_flip(self, img, boxes): 131 | '''Randomly flip the image and adjust the bbox locations. 132 | For bbox (xmin, ymin, xmax, ymax), the flipped bbox is: 133 | (w-xmax, ymin, w-xmin, ymax). 134 | Args: 135 | img: (ndarray.Image) image. f 136 | boxes: (tensor) bbox locations, sized [#obj, 4]. 137 | Returns: 138 | img: (ndarray.Image) randomly flipped image. 139 | boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4]. 140 | ''' 141 | if random.random() < 0.5: 142 | img = cv2.flip(img, 1) 143 | w = img.shape[1] 144 | xmin = w - boxes[:,2] 145 | xmax = w - boxes[:,0] 146 | boxes[:,0] = xmin 147 | boxes[:,2] = xmax 148 | return img, boxes 149 | 150 | def random_crop(self, img, boxes, labels): 151 | '''Randomly crop the image and adjust the bbox locations. 152 | For more details, see 'Chapter2.2: Data augmentation' of the paper. 153 | Args: 154 | img: (ndarray.Image) image. 155 | boxes: (tensor) bbox locations, sized [#obj, 4]. 156 | labels: (tensor) bbox labels, sized [#obj,]. 157 | Returns: 158 | img: (ndarray.Image) cropped image. 159 | selected_boxes: (tensor) selected bbox locations. 160 | labels: (tensor) selected bbox labels. 161 | ''' 162 | imw, imh = img.shape[1], img.shape[0] 163 | while True: 164 | min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])# random choice the one 165 | if min_iou is None: 166 | return img, boxes, labels 167 | 168 | for _ in range(100): 169 | w = random.randrange(int(0.1*imw), imw) 170 | h = random.randrange(int(0.1*imh), imh) 171 | 172 | if h > 2*w or w > 2*h or h < 1 or w < 1: 173 | continue 174 | 175 | x = random.randrange(imw - w) 176 | y = random.randrange(imh - h) 177 | roi = torch.Tensor([[x, y, x+w, y+h]]) 178 | 179 | center = (boxes[:,:2] + boxes[:,2:]) / 2 # [N,2] 180 | roi2 = roi.expand(len(center), 4) # [N,4] 181 | 182 | mask = (center > roi2[:,:2]) & (center < roi2[:,2:]) # [N,2] 183 | mask = mask[:,0] & mask[:,1] #[N,] 184 | 185 | if not mask.any(): 186 | continue 187 | 188 | selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1)) 189 | 190 | iou = self.data_encoder.iou(selected_boxes, roi) 191 | if iou.min() < min_iou: 192 | continue 193 | img = img[y:y+h, x:x+w, :] 194 | 195 | selected_boxes[:,0].add_(-x).clamp_(min=0, max=w) 196 | selected_boxes[:,1].add_(-y).clamp_(min=0, max=h) 197 | selected_boxes[:,2].add_(-x).clamp_(min=0, max=w) 198 | selected_boxes[:,3].add_(-y).clamp_(min=0, max=h) 199 | 200 | return img, selected_boxes, labels[mask] 201 | 202 | def __len__(self): 203 | return self.num_samples -------------------------------------------------------------------------------- /CFENet/scripts/encoder.py: -------------------------------------------------------------------------------- 1 | '''Encode target locations and labels.''' 2 | import torch 3 | 4 | import math 5 | import itertools 6 | 7 | class DataEncoder: 8 | def __init__(self): 9 | '''Compute default box sizes with scale and aspect transform.''' 10 | scale = 300. 
11 | steps = [s / scale for s in (32, 64, 100, 300, 8, 16)] 12 | sizes = [s / scale for s in (111, 162, 213, 264, 315, 30, 60)] 13 | aspect_ratios = ((2,3), (2,3), (2,), (2,), (2,), (2,3)) 14 | feature_map_sizes = (10, 5, 3, 1, 38, 19) 15 | num_layers = len(feature_map_sizes) 16 | 17 | boxes = [] 18 | for i in range(num_layers): 19 | fmsize = feature_map_sizes[i] # feature map size 20 | for h,w in itertools.product(range(fmsize), repeat=2): 21 | cx = (w + 0.5)*steps[i] 22 | cy = (h + 0.5)*steps[i] 23 | 24 | s = sizes[i] 25 | boxes.append((cx, cy, s, s)) 26 | 27 | s = math.sqrt(sizes[i] * sizes[i+1]) 28 | boxes.append((cx, cy, s, s)) 29 | 30 | s = sizes[i] 31 | for ar in aspect_ratios[i]: 32 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 33 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 34 | 35 | self.default_boxes = torch.Tensor(boxes) 36 | 37 | def iou(self, box1, box2): 38 | '''Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2]. 39 | 40 | Args: 41 | box1: (tensor) bounding boxes, sized [N,4]. 42 | box2: (tensor) bounding boxes, sized [M,4]. 43 | 44 | Return: 45 | (tensor) iou, sized [N,M]. 46 | ''' 47 | N = box1.size(0) 48 | M = box2.size(0) 49 | 50 | lt = torch.max( 51 | box1[:,:2].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 52 | box2[:,:2].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 53 | ) 54 | 55 | rb = torch.min( 56 | box1[:,2:].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 57 | box2[:,2:].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 58 | ) 59 | 60 | wh = rb - lt # [N,M,2] 61 | wh[wh<0] = 0 # clip at 0 62 | inter = wh[:,:,0] * wh[:,:,1] # [N,M] 63 | 64 | area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1]) # [N,] 65 | area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1]) # [M,] 66 | area1 = area1.unsqueeze(1).expand_as(inter) # [N,] -> [N,1] -> [N,M] 67 | area2 = area2.unsqueeze(0).expand_as(inter) # [M,] -> [1,M] -> [N,M] 68 | 69 | iou = inter / (area1 + area2 - inter) 70 | return iou 71 | 72 | def encode(self, boxes, classes, threshold=0.5): 73 | '''Transform target bounding boxes and class labels to SSD boxes and classes. 74 | 75 | Match each object box to all the default boxes, pick the ones with the 76 | Jaccard-Index > 0.5: 77 | Jaccard(A,B) = AB / (A+B-AB) 78 | 79 | Args: 80 | boxes: (tensor) object bounding boxes (xmin,ymin,xmax,ymax) of a image, sized [#obj, 4]. 81 | classes: (tensor) object class labels of a image, sized [#obj,]. 82 | threshold: (float) Jaccard index threshold 83 | 84 | Returns: 85 | boxes: (tensor) bounding boxes, sized [#obj, 8732, 4]. 
86 | classes: (tensor) class labels, sized [8732,] 87 | ''' 88 | default_boxes = self.default_boxes 89 | num_default_boxes = default_boxes.size(0) 90 | num_objs = boxes.size(0) 91 | 92 | iou = self.iou( # [#obj,8732] 93 | boxes, 94 | torch.cat([default_boxes[:,:2] - default_boxes[:,2:]/2, 95 | default_boxes[:,:2] + default_boxes[:,2:]/2], 1) 96 | ) 97 | 98 | iou, max_idx = iou.max(0) # [1,8732] 99 | max_idx.squeeze_(0) # [8732,] 100 | iou.squeeze_(0) # [8732,] 101 | 102 | boxes = boxes[max_idx] # [8732,4] 103 | variances = [0.1, 0.2] 104 | cxcy = (boxes[:,:2] + boxes[:,2:])/2 - default_boxes[:,:2] # [8732,2] 105 | cxcy /= variances[0] * default_boxes[:,2:] 106 | wh = (boxes[:,2:] - boxes[:,:2]) / default_boxes[:,2:] # [8732,2] 107 | wh = torch.log(wh) / variances[1] 108 | loc = torch.cat([cxcy, wh], 1) # [8732,4] 109 | 110 | conf = 1 + classes[max_idx] # [8732,], background class = 0 111 | conf[iou 0: 139 | while order.size > 0: 140 | try: 141 | i = order[0] 142 | except: 143 | i = order 144 | keep.append(i) 145 | 146 | # if order.numel() == 1: 147 | if order.size == 1: 148 | break 149 | 150 | xx1 = x1[order[1:]].clamp(min=x1[i]) 151 | yy1 = y1[order[1:]].clamp(min=y1[i]) 152 | xx2 = x2[order[1:]].clamp(max=x2[i]) 153 | yy2 = y2[order[1:]].clamp(max=y2[i]) 154 | 155 | w = (xx2-xx1).clamp(min=0) 156 | h = (yy2-yy1).clamp(min=0) 157 | inter = w*h 158 | 159 | if mode == 'union': 160 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 161 | elif mode == 'min': 162 | ovr = inter / areas[order[1:]].clamp(max=areas[i]) 163 | else: 164 | raise TypeError('Unknown nms mode: %s.' % mode) 165 | 166 | ids = (ovr<=threshold).nonzero().squeeze() 167 | # if ids.numel() == 0: 168 | if order.size == 0: 169 | break 170 | order = order[ids+1] 171 | # except: 172 | # print(order) 173 | # break 174 | return torch.LongTensor(keep) 175 | 176 | def decode(self, loc, conf): 177 | '''Transform predicted loc/conf back to real bbox locations and class labels. 178 | 179 | Args: 180 | loc: (tensor) predicted loc, sized [8732,4]. 181 | conf: (tensor) predicted conf, sized [8732,21]. 182 | 183 | Returns: 184 | boxes: (tensor) bbox locations, sized [#obj, 4]. 185 | labels: (tensor) class labels, sized [#obj,1]. 
186 | ''' 187 | variances = (0.1, 0.2) 188 | # print(loc.size(), conf.size()) 189 | wh = torch.exp(loc[:,2:]*variances[1]) * self.default_boxes[:,2:] 190 | cxcy = loc[:,:2] * variances[0] * self.default_boxes[:,2:] + self.default_boxes[:,:2] 191 | box_preds = torch.cat([cxcy-wh/2, cxcy+wh/2], 1) # [8732,4] 192 | 193 | boxes = [] 194 | labels = [] 195 | scores = [] 196 | num_classes = conf.size(1) 197 | for i in range(num_classes-1): 198 | score = conf[:,i+1] # class i corresponds to (i+1) column 199 | mask = score > 0.1 200 | 201 | if not mask.any(): 202 | continue 203 | 204 | box = box_preds[mask.nonzero().squeeze()] 205 | score = score[mask] 206 | 207 | if len(score) == 1: 208 | continue 209 | 210 | keep = self.nms(box, score, threshold=0.3) 211 | boxes.append(box[keep]) 212 | labels.append(torch.LongTensor(len(box[keep])).fill_(i)) 213 | scores.append(score[keep]) 214 | 215 | return boxes, labels, scores 216 | -------------------------------------------------------------------------------- /CFENet/scripts/multibox_layer.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLayer(nn.Module): 12 | num_classes = 13 13 | num_anchors = [6,6,4,4,4,6] 14 | in_planes = [512,256,256,256,512,512] 15 | 16 | def __init__(self): 17 | super(MultiBoxLayer, self).__init__() 18 | 19 | self.loc_layers = nn.ModuleList() 20 | self.conf_layers = nn.ModuleList() 21 | for i in range(len(self.in_planes)): 22 | self.loc_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*4, kernel_size=3, padding=1)) 23 | self.conf_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*13, kernel_size=3, padding=1)) 24 | 25 | def forward(self, xs): 26 | ''' 27 | Args: 28 | xs: (list) of tensor containing intermediate layer outputs. 29 | 30 | Returns: 31 | loc_preds: (tensor) predicted locations, sized [N,8732,4]. 32 | conf_preds: (tensor) predicted class confidences, sized [N,8732,21]. 33 | ''' 34 | y_locs = [] 35 | y_confs = [] 36 | for i, x in enumerate(xs): 37 | y_loc = self.loc_layers[i](x) 38 | N = y_loc.size(0) 39 | y_loc = y_loc.permute(0,2,3,1).contiguous() 40 | y_loc = y_loc.view(N,-1,4) 41 | y_locs.append(y_loc) 42 | 43 | y_conf = self.conf_layers[i](x) 44 | y_conf = y_conf.permute(0,2,3,1).contiguous() 45 | y_conf = y_conf.view(N,-1,13) 46 | y_confs.append(y_conf) 47 | 48 | loc_preds = torch.cat(y_locs, 1) 49 | conf_preds = torch.cat(y_confs, 1) 50 | return loc_preds, conf_preds 51 | -------------------------------------------------------------------------------- /CFENet/scripts/multibox_loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLoss(nn.Module): 12 | num_classes = 13 13 | 14 | def __init__(self): 15 | super(MultiBoxLoss, self).__init__() 16 | 17 | def cross_entropy_loss(self, x, y): 18 | '''Cross entropy loss w/o averaging across all samples. 19 | 20 | Args: 21 | x: (tensor) sized [N,D]. 22 | y: (tensor) sized [N,]. 23 | 24 | Return: 25 | (tensor) cross entroy loss, sized [N,]. 
26 | ''' 27 | xmax = x.data.max() 28 | print('x y size {} {}'.format(x.size(), y.size())) 29 | log_sum_exp = torch.log(torch.sum(torch.exp(x-xmax), 1)) + xmax 30 | print('log_sum_exp {}'.format(log_sum_exp.size())) 31 | return log_sum_exp - x.gather(1, y.view(-1,1)) 32 | 33 | def hard_negative_mining(self, conf_loss, pos): 34 | '''Return negative indices that is 3x the number as postive indices. 35 | 36 | Args: 37 | conf_loss: (tensor) cross entroy loss between conf_preds and conf_targets, sized [N*8732,]. 38 | pos: (tensor) positive(matched) box indices, sized [N,8732]. 39 | 40 | Return: 41 | (tensor) negative indices, sized [N,8732]. 42 | ''' 43 | batch_size, num_boxes = pos.size() 44 | # print(pos.size()) 45 | conf_loss[pos.view(-1)] = 0 # set pos boxes = 0, the rest are neg conf_loss 46 | conf_loss = conf_loss.view(batch_size, -1) # [N,8732] 47 | 48 | _,idx = conf_loss.sort(1, descending=True) # sort by neg conf_loss 49 | _,rank = idx.sort(1) # [N,8732] 50 | 51 | num_pos = pos.long().sum(1) # [N,1] 52 | num_neg = torch.clamp(3*num_pos, max=num_boxes-1) # [N,1] 53 | 54 | neg = rank < num_neg.unsqueeze(1).expand_as(rank) 55 | 56 | return neg 57 | 58 | def forward(self, loc_preds, loc_targets, conf_preds, conf_targets): 59 | '''Compute loss between (loc_preds, loc_targets) and (conf_preds, conf_targets). 60 | 61 | Args: 62 | loc_preds: (tensor) predicted locations, sized [batch_size, 8732, 4]. 63 | loc_targets: (tensor) encoded target locations, sized [batch_size, 8732, 4]. 64 | conf_preds: (tensor) predicted class confidences, sized [batch_size, 8732, num_classes]. 65 | conf_targets: (tensor) encoded target classes, sized [batch_size, 8732]. 66 | 67 | loss: 68 | (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + CrossEntropyLoss(conf_preds, conf_targets). 69 | ''' 70 | batch_size, num_boxes, _ = loc_preds.size() 71 | pos = conf_targets > 0 # [N,8732], pos means the box matched. 
72 | num_matched_boxes = pos.data.float().sum() 73 | if num_matched_boxes == 0: 74 | return torch.tensor([0.], requires_grad=True) 75 | 76 | ################################################################ 77 | # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets) 78 | ################################################################ 79 | pos_mask = pos.unsqueeze(2).expand_as(loc_preds) # [N,8732,4] 80 | pos_loc_preds = loc_preds[pos_mask].view(-1,4) # [#pos,4] 81 | pos_loc_targets = loc_targets[pos_mask].view(-1,4) # [#pos,4] 82 | 83 | loc_loss = F.smooth_l1_loss(pos_loc_preds, pos_loc_targets, size_average=False) 84 | 85 | ################################################################ 86 | # conf_loss = CrossEntropyLoss(pos_conf_preds, pos_conf_targets) 87 | # + CrossEntropyLoss(neg_conf_preds, neg_conf_targets) 88 | ################################################################ 89 | # print(conf_targets.view(-1).size()) 90 | conf_loss = F.cross_entropy(conf_preds.view(-1,self.num_classes), \ 91 | conf_targets.view(-1), reduce=False) # [N*8732,] 92 | neg = self.hard_negative_mining(conf_loss, pos) # [N,8732] 93 | 94 | pos_mask = pos.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 95 | neg_mask = neg.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 96 | mask = (pos_mask+neg_mask).gt(0) 97 | 98 | pos_and_neg = (pos+neg).gt(0) 99 | preds = conf_preds[mask].view(-1,self.num_classes) # [#pos+#neg,21] 100 | targets = conf_targets[pos_and_neg] # [#pos+#neg,] 101 | conf_loss = F.cross_entropy(preds, targets, size_average=False) 102 | 103 | loc_loss /= num_matched_boxes 104 | conf_loss /= num_matched_boxes 105 | 106 | return loc_loss + conf_loss 107 | -------------------------------------------------------------------------------- /CFENet/scripts/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import argparse 5 | import itertools 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | import torch.backends.cudnn as cudnn 12 | 13 | import torchvision 14 | import torchvision.transforms as transforms 15 | 16 | import numpy as np 17 | from cfenet import CFENet 18 | from datagen import ListDataset 19 | from multibox_loss import MultiBoxLoss 20 | 21 | lr = 0.001 22 | resume = False # Resume from checkpoint 23 | epoch = 200 24 | 25 | use_cuda = torch.cuda.is_available() 26 | best_loss = float('inf') # best test loss 27 | start_epoch = 0 # start from epoch 0 or last epoch 28 | 29 | # Data 30 | print('==> Preparing data..') 31 | transform = transforms.Compose([transforms.ToTensor(), 32 | transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]) 33 | 34 | trainset = ListDataset(root='../../../../VisDrone2019/dataset/VisDrone2018-DET-train/images/', list_file='../../../../VisDrone2019/dataset/VisDrone2018-DET-train/annotations/', train=True, transform=transform) 35 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True, num_workers=4) 36 | 37 | valset = ListDataset(root='../../../../VisDrone2019/dataset/VisDrone2019-DET-val/images/', list_file='../../../../VisDrone2019/dataset/VisDrone2019-DET-val/annotations/', train=True, transform=transform) 38 | valloader = torch.utils.data.DataLoader(valset, batch_size=8, shuffle=True, num_workers=4) 39 | 40 | 41 | # Model 42 | net = CFENet() 43 | if resume: 44 | print('==> Resuming from checkpoint..') 45 | checkpoint = 
torch.load('./checkpoint/ckpt.pth') 46 | 47 | keys = [] 48 | for k,v in checkpoint['net'].items(): 49 | if "module" in k: 50 | keys.append(k) 51 | for i in keys: 52 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 53 | del checkpoint['net'][i] 54 | 55 | net.load_state_dict(checkpoint['net']) 56 | best_loss = checkpoint['loss'] 57 | start_epoch = checkpoint['epoch'] 58 | else: 59 | # Convert from pretrained VGG model. 60 | try: 61 | net.load_state_dict(torch.load('../model/ssd.pth')) 62 | print('==> Pretrain model read successfully') 63 | except: 64 | print('==> Pretrain model read failed or not existed, training from init') 65 | 66 | 67 | criterion = MultiBoxLoss() 68 | 69 | if use_cuda: 70 | net = torch.nn.DataParallel(net, device_ids=[0]) 71 | net.cuda() 72 | cudnn.benchmark = True 73 | 74 | optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4) 75 | 76 | # Training 77 | def train(epoch, prev_val_loss, last_saved): 78 | print('\nEpoch: %d' % epoch) 79 | net.train() 80 | train_loss = 0 81 | 82 | for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader): 83 | if use_cuda: 84 | images = images.cuda() 85 | loc_targets = loc_targets.cuda() 86 | conf_targets = conf_targets.cuda() 87 | 88 | images = torch.tensor(images) 89 | loc_targets = torch.tensor(loc_targets) 90 | conf_targets = torch.tensor(conf_targets) 91 | 92 | optimizer.zero_grad() 93 | loc_preds, conf_preds = net(images) 94 | loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 95 | loss.backward() 96 | optimizer.step() 97 | 98 | train_loss += loss.item() 99 | if batch_idx%100 == 0: 100 | val_loss_tot = 0 101 | for batch_idx_val, (images, loc_targets, conf_targets) in enumerate(valloader): 102 | if use_cuda: 103 | images = images.cuda() 104 | loc_targets = loc_targets.cuda() 105 | conf_targets = conf_targets.cuda() 106 | 107 | images = torch.tensor(images) 108 | loc_targets = torch.tensor(loc_targets) 109 | conf_targets = torch.tensor(conf_targets) 110 | 111 | loc_preds, conf_preds = net(images) 112 | val_loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 113 | val_loss_tot += val_loss.item() 114 | 115 | val_loss_tot /= (batch_idx_val+1) 116 | if val_loss_tot < prev_val_loss: 117 | os.makedirs('checkpoint', exist_ok=True) 118 | torch.save({ 119 | 'epoch': epoch, 120 | 'net': net.state_dict(), 121 | 'loss': loss, 122 | }, 'checkpoint/ckpt.pth') 123 | print("Saved.") 124 | prev_val_loss = val_loss_tot 125 | last_saved = [epoch, batch_idx] 126 | print('epoch: {}, batch_idx: {},loss: {}, train_loss: {}, best_val_loss: {}, last_saved: {}'.format(epoch, batch_idx, loss.item(), train_loss/(batch_idx+1), prev_val_loss, last_saved)) 127 | 128 | return prev_val_loss, last_saved 129 | 130 | prev_val_loss = 999 131 | last_saved = [start_epoch,0] 132 | for epoch_num in range(start_epoch, start_epoch+epoch): 133 | prev_val_loss, last_saved = train(epoch_num, prev_val_loss, last_saved) 134 | -------------------------------------------------------------------------------- /CFENet/scripts/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | from torch.autograd import Variable 7 | 8 | class CFE(nn.Module): 9 | def __init__(self, channels, k): 10 | super(CFE, self).__init__() 11 | 12 | # First branch 13 | self.conv1_1 = nn.Conv2d(channels, channels//2, kernel_size=1, padding=0) 14 | self.bn1 = nn.BatchNorm2d(channels//2) 15 
| self.conv1_2 = nn.Conv2d(channels//2,channels//2, kernel_size=(k,1), padding=(k//2,0),groups=8) 16 | self.conv1_3 = nn.Conv2d(channels//2, channels//2, kernel_size=(1,k), padding=(0,k//2),groups=8) 17 | self.bn2 = nn.BatchNorm2d(channels//2) 18 | self.conv1_4 = nn.Conv2d(channels//2, channels//2, kernel_size=1, padding=0) 19 | self.bn3 = nn.BatchNorm2d(channels//2) 20 | 21 | # Second branch 22 | self.conv2_1 = nn.Conv2d(channels, channels//2, kernel_size=1, padding=0) 23 | self.bn4 = nn.BatchNorm2d(channels//2) 24 | self.conv2_2 = nn.Conv2d(channels//2,channels//2, kernel_size=(1,k), padding=(0,k//2),groups=8) 25 | self.conv2_3 = nn.Conv2d(channels//2, channels//2, kernel_size=(k,1), padding=(k//2,0),groups=8) 26 | self.bn5 = nn.BatchNorm2d(channels//2) 27 | self.conv2_4 = nn.Conv2d(channels//2, channels//2, kernel_size=1, padding=0) 28 | self.bn6 = nn.BatchNorm2d(channels//2) 29 | 30 | def forward(self, x): 31 | 32 | # First branch 33 | f = self.bn1(F.relu(self.conv1_1(x))) 34 | # print(f.size()) 35 | f = self.conv1_2(f) 36 | # print(f.size()) 37 | f = self.bn2(F.relu(self.conv1_3(f))) 38 | # print(f.size()) 39 | f = self.bn3(F.relu(self.conv1_4(f))) 40 | # print(f.size()) 41 | 42 | # Second branch 43 | s = self.bn4(F.relu(self.conv2_1(x))) 44 | # print(s.size()) 45 | s = self.conv2_2(s) 46 | # print(s.size()) 47 | s = self.bn5(F.relu(self.conv2_3(s))) 48 | # print(s.size()) 49 | s = self.bn6(F.relu(self.conv2_4(s))) 50 | # print(s.size()) 51 | 52 | fs = torch.cat((f,s), 1) 53 | # print(fs.size()) 54 | 55 | return (fs+x) 56 | 57 | class FFB(nn.Module): 58 | def __init__(self, c1, c2): 59 | super(FFB, self).__init__() 60 | 61 | self.conv1 = nn.Conv2d(c1, c1, kernel_size=1, padding=0) 62 | self.bn1 = nn.BatchNorm2d(c1) 63 | self.conv2 = nn.Conv2d(c2, c1, kernel_size=1, padding=0) 64 | self.bn2 = nn.BatchNorm2d(c1) 65 | self.deconv1 = nn.ConvTranspose2d(c1,c1, kernel_size=3, stride=2, padding=(1,1)) 66 | 67 | def forward(self, x1, x2): 68 | 69 | f = self.bn1(F.relu(self.conv1(x1))) 70 | # print(f.size()) 71 | s = self.bn2(F.relu(self.conv2(x2))) 72 | # print(s.size()) 73 | # s = F.upsample(s, scale_factor = 2) 74 | s = self.deconv1(s) 75 | # print(s.size()) 76 | 77 | # return(f+s) 78 | return f 79 | 80 | if __name__ == "__main__": 81 | x1 = torch.rand(1,512,38,38) 82 | x2 = torch.rand(1,1024,19,19) 83 | 84 | # model = CFE(512,3) 85 | model = FFB(512,1024) 86 | # x = model(x1) 87 | x = model(x1,x2) 88 | print(x.size()) 89 | 90 | 91 | x1 = torch.rand(1,512,19,19) 92 | x2 = torch.rand(1,1024,10,10) 93 | 94 | model = FFB(512,1024) 95 | x = model(x1,x2) 96 | print(x.size()) 97 | -------------------------------------------------------------------------------- /MDSSD_300/scripts/datagen.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load image/class/box from a annotation file. 
3 | 4 | ''' 5 | from __future__ import print_function 6 | 7 | import os 8 | import sys 9 | import os.path 10 | 11 | import random 12 | import numpy as np 13 | 14 | import torch 15 | import torch.utils.data as data 16 | import torchvision.transforms as transforms 17 | 18 | from encoder import DataEncoder 19 | import cv2 20 | 21 | import pandas as pd 22 | import shutil 23 | import os 24 | import numpy as np 25 | import glob 26 | import xml.etree.ElementTree as ET 27 | from xml.dom import minidom 28 | 29 | class ListDataset(data.Dataset): 30 | img_size = 300 31 | 32 | def __init__(self, root, list_file, train, transform): 33 | ''' 34 | Args: 35 | root: (str) ditectory to images. 36 | list_file: (str) path to annotation files. 37 | train: (boolean) train or test. 38 | transform: ([transforms]) image transforms. 39 | ''' 40 | self.root = root 41 | self.train = train 42 | self.transform = transform 43 | 44 | self.fnames = [] 45 | self.boxes = [] 46 | self.labels = [] 47 | 48 | self.data_encoder = DataEncoder() 49 | self.num_samples = 0 50 | 51 | # VisDrone 52 | 53 | for i in os.listdir(list_file): 54 | self.num_samples += 1 55 | self.fnames.append(i) 56 | box = [] 57 | labels = [] 58 | with open(os.path.join(list_file,i)) as f: 59 | f = f.read().split("\n") 60 | f = f[:-1] 61 | num_objs = len(f) 62 | 63 | for j in range(num_objs): 64 | f[j] = f[j].split(",") 65 | xmin = float(f[j][0]) 66 | ymin = float(f[j][1]) 67 | w = float(f[j][2]) 68 | h = float(f[j][3]) 69 | 70 | box.append([xmin,ymin,xmin+h,ymin+h]) 71 | labels.append(int(f[j][5])) 72 | 73 | self.boxes.append(torch.Tensor(box)) 74 | self.labels.append(torch.LongTensor(labels)) 75 | 76 | 77 | def __getitem__(self, idx): 78 | '''Load a image, and encode its bbox locations and class labels. 79 | Args: 80 | idx: (int) image index. 81 | Returns: 82 | img: (tensor) image tensor. 83 | loc_target: (tensor) location targets, sized [8732,4]. 84 | conf_target: (tensor) label targets, sized [8732,]. 85 | ''' 86 | # Load image and bbox locations. 87 | fname = self.fnames[idx] 88 | img = cv2.imread(os.path.join(self.root, fname[:-4]+".jpg")) 89 | boxes = self.boxes[idx].clone() 90 | labels = self.labels[idx] 91 | 92 | # Data augmentation while training. 93 | if self.train: 94 | img, boxes = self.random_flip(img, boxes) 95 | img, boxes, labels = self.random_crop(img, boxes, labels) 96 | 97 | # Scale bbox locaitons to [0,1]. 98 | w,h = img.shape[1], img.shape[0] 99 | boxes /= torch.Tensor([w,h,w,h]).expand_as(boxes) 100 | img = cv2.resize(img, (self.img_size,self.img_size)) 101 | img = self.transform(img) 102 | 103 | # Encode loc & conf targets. 104 | 105 | loc_target, conf_target = self.data_encoder.encode(boxes, labels) 106 | return img, loc_target, conf_target 107 | 108 | def random_flip(self, img, boxes): 109 | '''Randomly flip the image and adjust the bbox locations. 110 | For bbox (xmin, ymin, xmax, ymax), the flipped bbox is: 111 | (w-xmax, ymin, w-xmin, ymax). 112 | Args: 113 | img: (ndarray.Image) image. f 114 | boxes: (tensor) bbox locations, sized [#obj, 4]. 115 | Returns: 116 | img: (ndarray.Image) randomly flipped image. 117 | boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4]. 118 | ''' 119 | if random.random() < 0.5: 120 | img = cv2.flip(img, 1) 121 | w = img.shape[1] 122 | xmin = w - boxes[:,2] 123 | xmax = w - boxes[:,0] 124 | boxes[:,0] = xmin 125 | boxes[:,2] = xmax 126 | return img, boxes 127 | 128 | def random_crop(self, img, boxes, labels): 129 | '''Randomly crop the image and adjust the bbox locations. 
130 | For more details, see 'Chapter2.2: Data augmentation' of the paper. 131 | Args: 132 | img: (ndarray.Image) image. 133 | boxes: (tensor) bbox locations, sized [#obj, 4]. 134 | labels: (tensor) bbox labels, sized [#obj,]. 135 | Returns: 136 | img: (ndarray.Image) cropped image. 137 | selected_boxes: (tensor) selected bbox locations. 138 | labels: (tensor) selected bbox labels. 139 | ''' 140 | imw, imh = img.shape[1], img.shape[0] 141 | while True: 142 | min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])# random choice the one 143 | if min_iou is None: 144 | return img, boxes, labels 145 | 146 | for _ in range(100): 147 | w = random.randrange(int(0.1*imw), imw) 148 | h = random.randrange(int(0.1*imh), imh) 149 | 150 | if h > 2*w or w > 2*h or h < 1 or w < 1: 151 | continue 152 | 153 | x = random.randrange(imw - w) 154 | y = random.randrange(imh - h) 155 | roi = torch.Tensor([[x, y, x+w, y+h]]) 156 | 157 | center = (boxes[:,:2] + boxes[:,2:]) / 2 # [N,2] 158 | roi2 = roi.expand(len(center), 4) # [N,4] 159 | 160 | mask = (center > roi2[:,:2]) & (center < roi2[:,2:]) # [N,2] 161 | mask = mask[:,0] & mask[:,1] #[N,] 162 | 163 | if not mask.any(): 164 | continue 165 | 166 | selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1)) 167 | 168 | iou = self.data_encoder.iou(selected_boxes, roi) 169 | if iou.min() < min_iou: 170 | continue 171 | img = img[y:y+h, x:x+w, :] 172 | 173 | selected_boxes[:,0].add_(-x).clamp_(min=0, max=w) 174 | selected_boxes[:,1].add_(-y).clamp_(min=0, max=h) 175 | selected_boxes[:,2].add_(-x).clamp_(min=0, max=w) 176 | selected_boxes[:,3].add_(-y).clamp_(min=0, max=h) 177 | 178 | return img, selected_boxes, labels[mask] 179 | 180 | def __len__(self): 181 | return self.num_samples -------------------------------------------------------------------------------- /MDSSD_300/scripts/encoder.py: -------------------------------------------------------------------------------- 1 | '''Encode target locations and labels.''' 2 | import torch 3 | 4 | import math 5 | import itertools 6 | 7 | class DataEncoder: 8 | def __init__(self): 9 | '''Compute default box sizes with scale and aspect transform.''' 10 | scale = 300. 
11 | steps = [s / scale for s in (32, 64, 100, 300, 4, 8, 16)] 12 | sizes_ssd = [s / scale for s in (111, 162, 213, 264, 315)] 13 | sizes_fusion = [s / scale for s in (15, 30, 60, 111)] 14 | aspect_ratios = ((2,3), (2,3), (2,), (2,), (2,), (2,), (2,)) 15 | feature_map_sizes = (10, 5, 3, 1, 75, 38, 19) 16 | num_layers = len(feature_map_sizes) 17 | 18 | boxes = [] 19 | for i in range(num_layers): 20 | fmsize = feature_map_sizes[i] # feature map size 21 | for h,w in itertools.product(range(fmsize), repeat=2): 22 | cx = (w + 0.5)*steps[i] 23 | cy = (h + 0.5)*steps[i] 24 | 25 | if i<4: 26 | s = sizes_ssd[i] 27 | boxes.append((cx, cy, s, s)) 28 | 29 | s = math.sqrt(sizes_ssd[i] * sizes_ssd[i+1]) 30 | boxes.append((cx, cy, s, s)) 31 | 32 | s = sizes_ssd[i] 33 | for ar in aspect_ratios[i]: 34 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 35 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 36 | 37 | else: 38 | s = sizes_fusion[i-4] 39 | boxes.append((cx, cy, s, s)) 40 | 41 | s = math.sqrt(sizes_fusion[i-4] * sizes_fusion[i-4+1]) 42 | boxes.append((cx, cy, s, s)) 43 | 44 | s = sizes_fusion[i-4] 45 | for ar in aspect_ratios[i]: 46 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 47 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 48 | 49 | self.default_boxes = torch.Tensor(boxes) 50 | 51 | def iou(self, box1, box2): 52 | '''Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2]. 53 | 54 | Args: 55 | box1: (tensor) bounding boxes, sized [N,4]. 56 | box2: (tensor) bounding boxes, sized [M,4]. 57 | 58 | Return: 59 | (tensor) iou, sized [N,M]. 60 | ''' 61 | N = box1.size(0) 62 | M = box2.size(0) 63 | 64 | lt = torch.max( 65 | box1[:,:2].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 66 | box2[:,:2].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 67 | ) 68 | 69 | rb = torch.min( 70 | box1[:,2:].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 71 | box2[:,2:].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 72 | ) 73 | 74 | wh = rb - lt # [N,M,2] 75 | wh[wh<0] = 0 # clip at 0 76 | inter = wh[:,:,0] * wh[:,:,1] # [N,M] 77 | 78 | area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1]) # [N,] 79 | area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1]) # [M,] 80 | area1 = area1.unsqueeze(1).expand_as(inter) # [N,] -> [N,1] -> [N,M] 81 | area2 = area2.unsqueeze(0).expand_as(inter) # [M,] -> [1,M] -> [N,M] 82 | 83 | iou = inter / (area1 + area2 - inter) 84 | return iou 85 | 86 | def encode(self, boxes, classes, threshold=0.5): 87 | '''Transform target bounding boxes and class labels to SSD boxes and classes. 88 | 89 | Match each object box to all the default boxes, pick the ones with the 90 | Jaccard-Index > 0.5: 91 | Jaccard(A,B) = AB / (A+B-AB) 92 | 93 | Args: 94 | boxes: (tensor) object bounding boxes (xmin,ymin,xmax,ymax) of a image, sized [#obj, 4]. 95 | classes: (tensor) object class labels of a image, sized [#obj,]. 96 | threshold: (float) Jaccard index threshold 97 | 98 | Returns: 99 | boxes: (tensor) bounding boxes, sized [#obj, 8732, 4]. 
100 | classes: (tensor) class labels, sized [8732,] 101 | ''' 102 | default_boxes = self.default_boxes 103 | num_default_boxes = default_boxes.size(0) 104 | num_objs = boxes.size(0) 105 | 106 | iou = self.iou( # [#obj,8732] 107 | boxes, 108 | torch.cat([default_boxes[:,:2] - default_boxes[:,2:]/2, 109 | default_boxes[:,:2] + default_boxes[:,2:]/2], 1) 110 | ) 111 | 112 | iou, max_idx = iou.max(0) # [1,8732] 113 | max_idx.squeeze_(0) # [8732,] 114 | iou.squeeze_(0) # [8732,] 115 | 116 | boxes = boxes[max_idx] # [8732,4] 117 | variances = [0.1, 0.2] 118 | cxcy = (boxes[:,:2] + boxes[:,2:])/2 - default_boxes[:,:2] # [8732,2] 119 | cxcy /= variances[0] * default_boxes[:,2:] 120 | wh = (boxes[:,2:] - boxes[:,:2]) / default_boxes[:,2:] # [8732,2] 121 | wh = torch.log(wh) / variances[1] 122 | loc = torch.cat([cxcy, wh], 1) # [8732,4] 123 | 124 | conf = 1 + classes[max_idx] # [8732,], background class = 0 125 | conf[iou 0: 153 | try: 154 | i = order[0] 155 | except: 156 | i = order 157 | keep.append(i) 158 | 159 | if order.size == 1: 160 | break 161 | 162 | xx1 = x1[order[1:]].clamp(min=x1[i]) 163 | yy1 = y1[order[1:]].clamp(min=y1[i]) 164 | xx2 = x2[order[1:]].clamp(max=x2[i]) 165 | yy2 = y2[order[1:]].clamp(max=y2[i]) 166 | 167 | w = (xx2-xx1).clamp(min=0) 168 | h = (yy2-yy1).clamp(min=0) 169 | inter = w*h 170 | 171 | if mode == 'union': 172 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 173 | elif mode == 'min': 174 | ovr = inter / areas[order[1:]].clamp(max=areas[i]) 175 | else: 176 | raise TypeError('Unknown nms mode: %s.' % mode) 177 | 178 | ids = (ovr<=threshold).nonzero().squeeze() 179 | if ids.size == 0: 180 | break 181 | order = order[ids+1] 182 | return torch.LongTensor(keep) 183 | 184 | def decode(self, loc, conf): 185 | '''Transform predicted loc/conf back to real bbox locations and class labels. 186 | 187 | Args: 188 | loc: (tensor) predicted loc, sized [8732,4]. 189 | conf: (tensor) predicted conf, sized [8732,21]. 190 | 191 | Returns: 192 | boxes: (tensor) bbox locations, sized [#obj, 4]. 193 | labels: (tensor) class labels, sized [#obj,1]. 
194 | ''' 195 | variances = (0.1, 0.2) 196 | wh = torch.exp(loc[:,2:]*variances[1]) * self.default_boxes[:,2:] 197 | cxcy = loc[:,:2] * variances[0] * self.default_boxes[:,2:] + self.default_boxes[:,:2] 198 | box_preds = torch.cat([cxcy-wh/2, cxcy+wh/2], 1) # [8732,4] 199 | 200 | boxes = [] 201 | labels = [] 202 | scores = [] 203 | num_classes = conf.size(1) 204 | for i in range(num_classes-1): 205 | score = conf[:,i+1] # class i corresponds to (i+1) column 206 | mask = score > 0.1 207 | 208 | if not mask.any(): 209 | continue 210 | 211 | box = box_preds[mask.nonzero().squeeze()] 212 | score = score[mask] 213 | 214 | if len(score) == 1: 215 | continue 216 | keep = self.nms(box, score, threshold=0.3) 217 | boxes.append(box[keep]) 218 | labels.append(torch.LongTensor(len(box[keep])).fill_(i)) 219 | scores.append(score[keep]) 220 | 221 | return boxes, labels, scores 222 | -------------------------------------------------------------------------------- /MDSSD_300/scripts/fusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | from torch.autograd import Variable 7 | 8 | from norm import L2Norm 9 | 10 | class FusionBlock(nn.Module): 11 | def __init__(self, big_features, small_features): 12 | super(FusionBlock, self).__init__() 13 | 14 | # Bigger feature map 15 | self.conv1_1 = nn.Conv2d(big_features, 256, kernel_size=3, padding=1, dilation=1) 16 | self.Norm1 = L2Norm(256, 20) 17 | 18 | # Smaller feature map 19 | self.deconv2_1 = nn.ConvTranspose2d(small_features, 256, 2, stride=2, dilation=1) 20 | self.conv2_1 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 21 | self.deconv2_2 = nn.ConvTranspose2d(256, 256, 2, stride=2, dilation=1) 22 | self.conv2_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 23 | self.deconv2_3 = nn.ConvTranspose2d(256, 256, 3, stride=2, dilation=1) 24 | self.conv2_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 25 | self.Norm2 = L2Norm(256, 20) 26 | 27 | # Common 28 | self.conv3_1 = nn.Conv2d(256, big_features, kernel_size=3, padding=1, dilation=1) 29 | 30 | 31 | def forward(self, big, small): 32 | h1 = self.conv1_1(big) 33 | h1 = self.Norm1(h1) 34 | 35 | h2 = self.deconv2_1(small) 36 | # print(h2.size()) 37 | h2 = F.relu(self.conv2_1(h2)) 38 | # print(h2.size()) 39 | h2 = self.deconv2_2(h2) 40 | # print(h2.size()) 41 | h2 = F.relu(self.conv2_2(h2)) 42 | # print(h2.size()) 43 | h2 = self.deconv2_3(h2) 44 | # print(h2.size()) 45 | h2 = self.conv2_3(h2) 46 | # print(h2.size()) 47 | h2 = self.Norm2(h2) 48 | 49 | size = h2.size()[3] 50 | diff_odd = h2.size()[-1] - h1.size()[-1] 51 | h2 = h2[:,:,(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2)),(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2))] 52 | 53 | # print(h1.size(), h2.size()) 54 | h = F.relu(h1+h2) 55 | h = F.relu(self.conv3_1(h)) 56 | 57 | return h 58 | 59 | if __name__ == '__main__': 60 | big = torch.randn(1, 256, 128, 128) 61 | small = torch.rand(1,512,16,16) 62 | net = FusionBlock(256,512) 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /MDSSD_300/scripts/mdssd.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | 9 | from torch.autograd import Variable 10 | 11 | from multibox_layer 
import MultiBoxLayer 12 | from fusion import FusionBlock 13 | from norm import L2Norm 14 | 15 | class MDSSD300(nn.Module): 16 | input_size = 300 17 | 18 | def __init__(self): 19 | super(MDSSD300, self).__init__() 20 | 21 | # model 22 | self.base = self.VGG16() 23 | self.norm4 = L2Norm(512, 20) # 38 24 | 25 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 26 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 27 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 28 | 29 | self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 30 | 31 | self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 32 | 33 | self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1) 34 | self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2) 35 | 36 | self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1) 37 | self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2) 38 | 39 | self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1) 40 | self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3) 41 | 42 | self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1) 43 | self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3) 44 | 45 | self.Fusion1 = FusionBlock(256,512) 46 | self.Fusion2 = FusionBlock(512,256) 47 | self.Fusion3 = FusionBlock(1024,256) 48 | 49 | # multibox layer 50 | self.multibox = MultiBoxLayer() 51 | 52 | def forward(self, x): 53 | odd_count = 0 54 | odd = [] 55 | hs = [] 56 | vgg = [] 57 | fusion_layers = [] 58 | h = self.base[0](x) 59 | vgg.append(h) 60 | for i in range(1,len(self.base)): 61 | h = self.base[i](h) 62 | vgg.append(h) 63 | fusion_layers.append(vgg[15]) 64 | odd.append(2) 65 | odd_count = 3 66 | fusion_layers.append(h) 67 | h = F.max_pool2d(h, kernel_size=2, stride=2, ceil_mode=True) 68 | 69 | h = F.relu(self.conv5_1(h)) 70 | h = F.relu(self.conv5_2(h)) 71 | h = F.relu(self.conv5_3(h)) 72 | h = F.max_pool2d(h, kernel_size=3, padding=1, stride=1, ceil_mode=True) 73 | 74 | h = F.relu(self.conv6(h)) 75 | h = F.relu(self.conv7(h)) 76 | fusion_layers.append(h) 77 | 78 | h = F.relu(self.conv8_1(h)) 79 | h = F.relu(self.conv8_2(h)) 80 | hs.append(h) # conv8_2 81 | 82 | h = F.relu(self.conv9_1(h)) 83 | h = F.relu(self.conv9_2(h)) 84 | hs.append(h) # conv9_2 85 | 86 | h = F.relu(self.conv10_1(h)) 87 | h = F.relu(self.conv10_2(h)) 88 | hs.append(h) # conv10_2 89 | 90 | h = F.relu(self.conv11_1(h)) 91 | h = F.relu(self.conv11_2(h)) 92 | hs.append(h) # conv11_2 93 | 94 | # Fusion Blocks 95 | f = self.Fusion1(fusion_layers[0],hs[-4]) 96 | hs.append(f) 97 | f = self.Fusion2(fusion_layers[1],hs[-4]) 98 | hs.append(f) 99 | diff_odd = fusion_layers[2].size()[-1] - hs[-4].size()[-1] 100 | f = self.Fusion3(fusion_layers[2],hs[-4]) 101 | hs.append(f) 102 | 103 | loc_preds, conf_preds = self.multibox(hs) 104 | 105 | return loc_preds, conf_preds 106 | 107 | def VGG16(self): 108 | '''VGG16 layers.''' 109 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512] 110 | layers = [] 111 | in_channels = 3 112 | for x in cfg: 113 | if x == 'M': 114 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 115 | else: 116 | layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 117 | nn.ReLU(True)] 118 | in_channels = x 119 | return nn.Sequential(*layers) 120 | 121 | if __name__ == '__main__': 122 | t = torch.randn(1, 3, 300, 300) 123 | net = MDSSD300() 124 | # print(net) 125 | res = net.forward(t) 126 | -------------------------------------------------------------------------------- 
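Shape note for MDSSD_300 (added for clarity; the constants below are read off DataEncoder.__init__ in encoder.py above and MultiBoxLayer.num_anchors in multibox_layer.py below, and the totals are derived from those values, not stated anywhere in the original code): the docstrings in this folder still quote the stock SSD300 anchor count of 8732, but the three fusion feature maps (75x75, 38x38, 19x19) push the real total much higher. A minimal sanity-check sketch:

feature_map_sizes = (10, 5, 3, 1, 75, 38, 19)  # DataEncoder.__init__ (encoder.py)
anchors_per_cell = (6, 6, 4, 4, 4, 4, 4)       # MultiBoxLayer.num_anchors (multibox_layer.py)
total = sum(f * f * a for f, a in zip(feature_map_sizes, anchors_per_cell))
print(total)  # 30510 -> loc_preds is [N, 30510, 4], conf_preds is [N, 30510, 13]

So wherever a comment or docstring in the MDSSD_300 scripts says 8732, the tensors produced by this model actually have 30510 rows.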
/MDSSD_300/scripts/multibox_layer.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLayer(nn.Module): 12 | num_classes = 13 13 | num_anchors = [6,6,4,4,4,4,4] 14 | 15 | def __init__(self): 16 | super(MultiBoxLayer, self).__init__() 17 | self.in_planes = [512,256,256,256,256,512,1024] 18 | self.loc_layers = nn.ModuleList() 19 | self.conf_layers = nn.ModuleList() 20 | for i in range(len(self.in_planes)): 21 | self.loc_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*4, kernel_size=3, padding=1)) 22 | self.conf_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*13, kernel_size=3, padding=1)) 23 | 24 | def forward(self, xs): 25 | ''' 26 | Args: 27 | xs: (list) of tensor containing intermediate layer outputs. 28 | 29 | Returns: 30 | loc_preds: (tensor) predicted locations, sized [N,8732,4]. 31 | conf_preds: (tensor) predicted class confidences, sized [N,8732,21]. 32 | ''' 33 | y_locs = [] 34 | y_confs = [] 35 | for i, x in enumerate(xs): 36 | y_loc = self.loc_layers[i](x) 37 | N = y_loc.size(0) 38 | y_loc = y_loc.permute(0,2,3,1).contiguous() 39 | y_loc = y_loc.view(N,-1,4) 40 | y_locs.append(y_loc) 41 | 42 | y_conf = self.conf_layers[i](x) 43 | y_conf = y_conf.permute(0,2,3,1).contiguous() 44 | y_conf = y_conf.view(N,-1,13) 45 | y_confs.append(y_conf) 46 | 47 | loc_preds = torch.cat(y_locs, 1) 48 | conf_preds = torch.cat(y_confs, 1) 49 | return loc_preds, conf_preds 50 | -------------------------------------------------------------------------------- /MDSSD_300/scripts/multibox_loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLoss(nn.Module): 12 | num_classes = 13 13 | 14 | def __init__(self): 15 | super(MultiBoxLoss, self).__init__() 16 | 17 | def cross_entropy_loss(self, x, y): 18 | '''Cross entropy loss w/o averaging across all samples. 19 | 20 | Args: 21 | x: (tensor) sized [N,D]. 22 | y: (tensor) sized [N,]. 23 | 24 | Return: 25 | (tensor) cross entroy loss, sized [N,]. 26 | ''' 27 | xmax = x.data.max() 28 | print('x y size {} {}'.format(x.size(), y.size())) 29 | log_sum_exp = torch.log(torch.sum(torch.exp(x-xmax), 1)) + xmax 30 | print('log_sum_exp {}'.format(log_sum_exp.size())) 31 | return log_sum_exp - x.gather(1, y.view(-1,1)) 32 | 33 | def hard_negative_mining(self, conf_loss, pos): 34 | '''Return negative indices that is 3x the number as postive indices. 35 | 36 | Args: 37 | conf_loss: (tensor) cross entroy loss between conf_preds and conf_targets, sized [N*8732,]. 38 | pos: (tensor) positive(matched) box indices, sized [N,8732]. 39 | 40 | Return: 41 | (tensor) negative indices, sized [N,8732]. 
42 | ''' 43 | batch_size, num_boxes = pos.size() 44 | conf_loss[pos.view(-1)] = 0 # set pos boxes = 0, the rest are neg conf_loss 45 | conf_loss = conf_loss.view(batch_size, -1) # [N,8732] 46 | 47 | _,idx = conf_loss.sort(1, descending=True) # sort by neg conf_loss 48 | _,rank = idx.sort(1) # [N,8732] 49 | 50 | num_pos = pos.long().sum(1) # [N,1] 51 | num_neg = torch.clamp(3*num_pos, max=num_boxes-1) # [N,1] 52 | 53 | neg = rank < num_neg.unsqueeze(1).expand_as(rank) 54 | 55 | return neg 56 | 57 | def forward(self, loc_preds, loc_targets, conf_preds, conf_targets): 58 | '''Compute loss between (loc_preds, loc_targets) and (conf_preds, conf_targets). 59 | 60 | Args: 61 | loc_preds: (tensor) predicted locations, sized [batch_size, 8732, 4]. 62 | loc_targets: (tensor) encoded target locations, sized [batch_size, 8732, 4]. 63 | conf_preds: (tensor) predicted class confidences, sized [batch_size, 8732, num_classes]. 64 | conf_targets: (tensor) encoded target classes, sized [batch_size, 8732]. 65 | 66 | loss: 67 | (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + CrossEntropyLoss(conf_preds, conf_targets). 68 | ''' 69 | 70 | # loc_preds = loc_preds[:,:8732,:] 71 | # conf_preds = conf_preds[:,:8732,:] 72 | 73 | batch_size, num_boxes, _ = loc_preds.size() 74 | pos = conf_targets > 0 # [N,8732], pos means the box matched. 75 | # print(pos.size()) 76 | num_matched_boxes = pos.data.float().sum() 77 | if num_matched_boxes == 0: 78 | return torch.tensor([0.], requires_grad=True) 79 | 80 | ################################################################ 81 | # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets) 82 | ################################################################ 83 | pos_mask = pos.unsqueeze(2).expand_as(loc_preds) # [N,8732,4] 84 | pos_loc_preds = loc_preds[pos_mask].view(-1,4) # [#pos,4] 85 | pos_loc_targets = loc_targets[pos_mask].view(-1,4) # [#pos,4] 86 | 87 | loc_loss = F.smooth_l1_loss(pos_loc_preds, pos_loc_targets, size_average=False) 88 | 89 | ################################################################ 90 | # conf_loss = CrossEntropyLoss(pos_conf_preds, pos_conf_targets) 91 | # + CrossEntropyLoss(neg_conf_preds, neg_conf_targets) 92 | ################################################################ 93 | conf_preds = conf_preds.contiguous() 94 | # print(conf_preds.size(), conf_targets.size()) 95 | conf_loss = F.cross_entropy(conf_preds.view(-1,self.num_classes), \ 96 | conf_targets.view(-1), reduce=False) # [N*8732,] 97 | neg = self.hard_negative_mining(conf_loss, pos) # [N,8732] 98 | 99 | pos_mask = pos.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 100 | neg_mask = neg.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 101 | mask = (pos_mask+neg_mask).gt(0) 102 | 103 | pos_and_neg = (pos+neg).gt(0) 104 | preds = conf_preds[mask].view(-1,self.num_classes) # [#pos+#neg,21] 105 | targets = conf_targets[pos_and_neg] # [#pos+#neg,] 106 | conf_loss = F.cross_entropy(preds, targets, size_average=False) 107 | 108 | loc_loss /= num_matched_boxes 109 | conf_loss /= num_matched_boxes 110 | 111 | return loc_loss + conf_loss 112 | -------------------------------------------------------------------------------- /MDSSD_300/scripts/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | class L2Norm(nn.Module): 7 | '''L2Norm layer across all channels and scale.''' 8 | def __init__(self, in_features,scale): 9 | super(L2Norm, 
self).__init__() 10 | self.weight = nn.Parameter(torch.Tensor(in_features)) 11 | self.reset_parameters(scale) 12 | 13 | def reset_parameters(self, scale): 14 | nn.init.constant_(self.weight, scale) 15 | 16 | def forward(self, x): 17 | x = F.normalize(x, dim=1) 18 | scale = self.weight[None,:,None,None] 19 | return scale * x -------------------------------------------------------------------------------- /MDSSD_300/scripts/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.nn.functional as F 4 | import torchvision.transforms as transforms 5 | 6 | import sys 7 | from mdssd import MDSSD300 8 | from encoder import DataEncoder 9 | import cv2 10 | 11 | VOC_LABELS = ( 12 | 'ignored regions', 13 | 'pedestrian', 14 | 'people', 15 | 'bicycle', 16 | 'car', 17 | 'van', 18 | 'truck', 19 | 'tricycle', 20 | 'awning-tricycle', 21 | 'bus', 22 | 'motor', 23 | 'other' 24 | ) 25 | 26 | 27 | # Load model 28 | net = MDSSD300() 29 | checkpoint = torch.load('./checkpoint/ckpt.pth') 30 | 31 | keys = [] 32 | for k,v in checkpoint['net'].items(): 33 | if "module" in k: 34 | keys.append(k) 35 | for i in keys: 36 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 37 | del checkpoint['net'][i] 38 | 39 | net.load_state_dict(checkpoint['net']) 40 | net.eval() 41 | 42 | if len(sys.argv) == 2: 43 | img_path = sys.argv[1] 44 | else: 45 | img_path = './images/img1.jpg' 46 | 47 | # Load test image 48 | img = cv2.imread(img_path) 49 | img1 = cv2.resize(img, (300, 300)) 50 | transform = transforms.Compose([transforms.ToTensor(), 51 | transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]) 52 | img1 = transform(img1) 53 | 54 | # Forward 55 | with torch.no_grad(): 56 | x = torch.tensor(img1) 57 | loc_preds, conf = net(x.unsqueeze(0)) 58 | # Decode 59 | data_encoder = DataEncoder() 60 | boxes, labels, scores = data_encoder.decode(loc_preds.data.squeeze(0), F.softmax(conf.squeeze(0), dim=1).data) 61 | for box, label, score in zip(boxes, labels, scores): 62 | box[::2] *= img.shape[1] 63 | box[1::2] *= img.shape[0] 64 | for b, s in zip(box, score): 65 | if s > 0.25: 66 | print('label:',VOC_LABELS[int(label[0])], 'score:', score) 67 | b = list(b) 68 | cv2.rectangle(img, (b[0], b[1]), (b[2], b[3]), (255, 255, 255), 2) 69 | title = '{}: {}'.format(VOC_LABELS[int(label[0])], round(float(score[0]), 2)) 70 | cv2.putText(img, title, (b[0], b[1]), cv2.FONT_ITALIC, 0.6, (0, 255, 0), 2) 71 | cv2.imshow('img', img) 72 | cv2.waitKey(0) -------------------------------------------------------------------------------- /MDSSD_300/scripts/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import argparse 5 | import itertools 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | import torch.backends.cudnn as cudnn 12 | 13 | import torchvision 14 | import torchvision.transforms as transforms 15 | 16 | import numpy as np 17 | from mdssd import MDSSD300 18 | from datagen import ListDataset 19 | from multibox_loss import MultiBoxLoss 20 | 21 | lr = 0.001 22 | resume = False # Resume from checkpoint 23 | epoch = 200 24 | batch_size = 8 25 | 26 | use_cuda = torch.cuda.is_available() 27 | best_loss = float('inf') # best test loss 28 | start_epoch = 0 # start from epoch 0 or last epoch 29 | 30 | # Data 31 | print('==> Preparing data..') 32 | transform = 
transforms.Compose([transforms.ToTensor(), 33 | transforms.Normalize(mean=(0.356, 0.368, 0.362), std=(0.242, 0.235, 0.236))]) 34 | 35 | trainset = ListDataset(root='../../../../VisDrone2019/dataset/VisDrone2018-DET-train/images/', list_file='../../../../VisDrone2019/dataset/VisDrone2018-DET-train/annotations/', train=True, transform=transform) 36 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True, num_workers=4) 37 | 38 | valset = ListDataset(root='../../../../VisDrone2019/dataset/VisDrone2019-DET-val/images/', list_file='../../../../VisDrone2019/dataset/VisDrone2019-DET-val/annotations/', train=True, transform=transform) 39 | valloader = torch.utils.data.DataLoader(valset, batch_size=8, shuffle=True, num_workers=4) 40 | 41 | # Model 42 | net = MDSSD300() 43 | if resume: 44 | print('==> Resuming from checkpoint..') 45 | checkpoint = torch.load('./checkpoint/ckpt.pth') 46 | 47 | keys = [] 48 | for k,v in checkpoint['net'].items(): 49 | if "module" in k: 50 | keys.append(k) 51 | for i in keys: 52 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 53 | del checkpoint['net'][i] 54 | 55 | net.load_state_dict(checkpoint['net']) 56 | best_loss = checkpoint['loss'] 57 | start_epoch = checkpoint['epoch'] 58 | else: 59 | # Convert from pretrained VGG model. 60 | try: 61 | net.load_state_dict(torch.load('../model/ssd.pth')) 62 | print('==> Pretrain model read successfully') 63 | except: 64 | print('==> Pretrain model read failed or not existed, training from init') 65 | 66 | criterion = MultiBoxLoss() 67 | 68 | if use_cuda: 69 | net = torch.nn.DataParallel(net, device_ids=[0]) 70 | net.cuda() 71 | cudnn.benchmark = True 72 | 73 | optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4) 74 | # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.25) 75 | 76 | # Training 77 | def train(epoch,prev_val_loss, last_saved): 78 | print('\nEpoch: %d' % epoch) 79 | net.train() 80 | train_loss = 0 81 | for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader): 82 | if use_cuda: 83 | images = images.cuda() 84 | loc_targets = loc_targets.cuda() 85 | conf_targets = conf_targets.cuda() 86 | 87 | images = torch.tensor(images) 88 | loc_targets = torch.tensor(loc_targets) 89 | conf_targets = torch.tensor(conf_targets) 90 | 91 | optimizer.zero_grad() 92 | loc_preds, conf_preds = net(images) 93 | loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 94 | loss.backward() 95 | optimizer.step() 96 | # scheduler.step() 97 | 98 | train_loss += loss.item() 99 | if batch_idx%100 == 0: 100 | val_loss_tot = 0 101 | for batch_idx_val, (images, loc_targets, conf_targets) in enumerate(valloader): 102 | if use_cuda: 103 | images = images.cuda() 104 | loc_targets = loc_targets.cuda() 105 | conf_targets = conf_targets.cuda() 106 | 107 | images = torch.tensor(images) 108 | loc_targets = torch.tensor(loc_targets) 109 | conf_targets = torch.tensor(conf_targets) 110 | 111 | loc_preds, conf_preds = net(images) 112 | val_loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 113 | val_loss_tot += val_loss.item() 114 | 115 | val_loss_tot /= (batch_idx_val+1) 116 | if val_loss_tot < prev_val_loss: 117 | os.makedirs('checkpoint', exist_ok=True) 118 | torch.save({ 119 | 'epoch': epoch, 120 | 'net': net.state_dict(), 121 | 'loss': loss, 122 | }, 'checkpoint/ckpt.pth') 123 | print("Saved.") 124 | prev_val_loss = val_loss_tot 125 | last_saved = [epoch, batch_idx] 126 | 127 | print('epoch: {}, batch_idx: {},loss: {}, train_loss: {}, 
best_val_loss: {}, last_saved: {}'.format(epoch, batch_idx, loss.item(), train_loss/(batch_idx+1), prev_val_loss, last_saved)) 128 | 129 | return prev_val_loss, last_saved 130 | 131 | 132 | prev_val_loss = 999 133 | last_saved = [start_epoch,0] 134 | for epoch_num in range(start_epoch, start_epoch+epoch): 135 | prev_val_loss, last_saved = train(epoch_num, prev_val_loss, last_saved) 136 | -------------------------------------------------------------------------------- /MDSSD_512/scripts/datagen.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load image/class/box from a annotation file. 3 | 4 | ''' 5 | from __future__ import print_function 6 | 7 | import os 8 | import sys 9 | import os.path 10 | 11 | import random 12 | import numpy as np 13 | 14 | import torch 15 | import torch.utils.data as data 16 | import torchvision.transforms as transforms 17 | 18 | from encoder import DataEncoder 19 | import cv2 20 | 21 | import pandas as pd 22 | import shutil 23 | import os 24 | import numpy as np 25 | import glob 26 | import xml.etree.ElementTree as ET 27 | from xml.dom import minidom 28 | 29 | class ListDataset(data.Dataset): 30 | img_size = 512 31 | 32 | def __init__(self, root, list_file, train, transform): 33 | ''' 34 | Args: 35 | root: (str) ditectory to images. 36 | list_file: (str) path to annotation files. 37 | train: (boolean) train or test. 38 | transform: ([transforms]) image transforms. 39 | ''' 40 | self.root = root 41 | self.train = train 42 | self.transform = transform 43 | 44 | self.fnames = [] 45 | self.boxes = [] 46 | self.labels = [] 47 | 48 | self.data_encoder = DataEncoder() 49 | self.num_samples = 0 50 | 51 | # VisDrone 52 | 53 | for i in os.listdir(list_file): 54 | self.num_samples += 1 55 | self.fnames.append(i) 56 | box = [] 57 | labels = [] 58 | with open(os.path.join(list_file,i)) as f: 59 | f = f.read().split("\n") 60 | f = f[:-1] 61 | num_objs = len(f) 62 | 63 | for j in range(num_objs): 64 | f[j] = f[j].split(",") 65 | xmin = float(f[j][0]) 66 | ymin = float(f[j][1]) 67 | w = float(f[j][2]) 68 | h = float(f[j][3]) 69 | 70 | box.append([xmin,ymin,xmin+w,ymin+h]) 71 | labels.append(int(f[j][5])) 72 | 73 | self.boxes.append(torch.Tensor(box)) 74 | self.labels.append(torch.LongTensor(labels)) 75 | 76 | 77 | def __getitem__(self, idx): 78 | '''Load a image, and encode its bbox locations and class labels. 79 | Args: 80 | idx: (int) image index. 81 | Returns: 82 | img: (tensor) image tensor. 83 | loc_target: (tensor) location targets, sized [8732,4]. 84 | conf_target: (tensor) label targets, sized [8732,]. 85 | ''' 86 | # Load image and bbox locations. 87 | fname = self.fnames[idx] 88 | img = cv2.imread(os.path.join(self.root, fname[:-4]+".jpg")) 89 | boxes = self.boxes[idx].clone() 90 | labels = self.labels[idx] 91 | 92 | # Data augmentation while training. 93 | if self.train: 94 | img, boxes = self.random_flip(img, boxes) 95 | img, boxes, labels = self.random_crop(img, boxes, labels) 96 | 97 | # Scale bbox locaitons to [0,1]. 98 | w,h = img.shape[1], img.shape[0] 99 | boxes /= torch.Tensor([w,h,w,h]).expand_as(boxes) 100 | img = cv2.resize(img, (self.img_size,self.img_size)) 101 | img = self.transform(img) 102 | 103 | # Encode loc & conf targets. 104 | 105 | loc_target, conf_target = self.data_encoder.encode(boxes, labels) 106 | return img, loc_target, conf_target 107 | 108 | def random_flip(self, img, boxes): 109 | '''Randomly flip the image and adjust the bbox locations. 
110 | For bbox (xmin, ymin, xmax, ymax), the flipped bbox is: 111 | (w-xmax, ymin, w-xmin, ymax). 112 | Args: 113 | img: (ndarray.Image) image. f 114 | boxes: (tensor) bbox locations, sized [#obj, 4]. 115 | Returns: 116 | img: (ndarray.Image) randomly flipped image. 117 | boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4]. 118 | ''' 119 | if random.random() < 0.5: 120 | img = cv2.flip(img, 1) 121 | w = img.shape[1] 122 | xmin = w - boxes[:,2] 123 | xmax = w - boxes[:,0] 124 | boxes[:,0] = xmin 125 | boxes[:,2] = xmax 126 | return img, boxes 127 | 128 | def random_crop(self, img, boxes, labels): 129 | '''Randomly crop the image and adjust the bbox locations. 130 | For more details, see 'Chapter2.2: Data augmentation' of the paper. 131 | Args: 132 | img: (ndarray.Image) image. 133 | boxes: (tensor) bbox locations, sized [#obj, 4]. 134 | labels: (tensor) bbox labels, sized [#obj,]. 135 | Returns: 136 | img: (ndarray.Image) cropped image. 137 | selected_boxes: (tensor) selected bbox locations. 138 | labels: (tensor) selected bbox labels. 139 | ''' 140 | imw, imh = img.shape[1], img.shape[0] 141 | while True: 142 | min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])# random choice the one 143 | if min_iou is None: 144 | return img, boxes, labels 145 | 146 | for _ in range(100): 147 | w = random.randrange(int(0.1*imw), imw) 148 | h = random.randrange(int(0.1*imh), imh) 149 | 150 | if h > 2*w or w > 2*h or h < 1 or w < 1: 151 | continue 152 | 153 | x = random.randrange(imw - w) 154 | y = random.randrange(imh - h) 155 | roi = torch.Tensor([[x, y, x+w, y+h]]) 156 | 157 | center = (boxes[:,:2] + boxes[:,2:]) / 2 # [N,2] 158 | roi2 = roi.expand(len(center), 4) # [N,4] 159 | 160 | mask = (center > roi2[:,:2]) & (center < roi2[:,2:]) # [N,2] 161 | mask = mask[:,0] & mask[:,1] #[N,] 162 | 163 | if not mask.any(): 164 | continue 165 | 166 | selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1)) 167 | 168 | iou = self.data_encoder.iou(selected_boxes, roi) 169 | if iou.min() < min_iou: 170 | continue 171 | img = img[y:y+h, x:x+w, :] 172 | 173 | selected_boxes[:,0].add_(-x).clamp_(min=0, max=w) 174 | selected_boxes[:,1].add_(-y).clamp_(min=0, max=h) 175 | selected_boxes[:,2].add_(-x).clamp_(min=0, max=w) 176 | selected_boxes[:,3].add_(-y).clamp_(min=0, max=h) 177 | 178 | return img, selected_boxes, labels[mask] 179 | 180 | def __len__(self): 181 | return self.num_samples -------------------------------------------------------------------------------- /MDSSD_512/scripts/encoder.py: -------------------------------------------------------------------------------- 1 | '''Encode target locations and labels.''' 2 | import torch 3 | 4 | import math 5 | import itertools 6 | 7 | class DataEncoder: 8 | def __init__(self): 9 | '''Compute default box sizes with scale and aspect transform.''' 10 | scale = 512. 
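        # Sanity-check sketch, assuming the feature_map_sizes defined just below
        # and the per-location anchor counts from multibox_layer.py
        # ([6,6,4,4,4,4,4,4]): the "[8732, ...]" sizes quoted in the docstrings
        # are the vanilla SSD300 prior count, while this 512 configuration gives
        #
        #   fm_sizes         = (16, 8, 4, 2, 1, 128, 64, 32)
        #   anchors_per_cell = (6, 6, 4, 4, 4, 4, 4, 4)
        #   sum(a * f * f for a, f in zip(anchors_per_cell, fm_sizes))  # -> 88020
        #
        # so encode() should return loc targets sized [88020, 4] and conf targets
        # sized [88020].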
11 | steps = [s / scale for s in (32, 64, 128, 256, 512, 4, 8, 16)] 12 | sizes_ssd = [s / scale for s in (115.0, 230.4, 307.2, 384.0, 460.8, 537.6)] 13 | sizes_fusion = [s / scale for s in (35.84, 76.8, 153.6, 230.4)] 14 | aspect_ratios = ((2,3), (2,3), (2,), (2,), (2,), (2,), (2,),(2,)) 15 | feature_map_sizes = (16,8,4,2,1,128,64,32) 16 | num_layers = len(feature_map_sizes) 17 | 18 | boxes = [] 19 | for i in range(num_layers): 20 | fmsize = feature_map_sizes[i] # feature map size 21 | for h,w in itertools.product(range(fmsize), repeat=2): 22 | cx = (w + 0.5)*steps[i] 23 | cy = (h + 0.5)*steps[i] 24 | 25 | if i<5: 26 | s = sizes_ssd[i] 27 | boxes.append((cx, cy, s, s)) 28 | 29 | s = math.sqrt(sizes_ssd[i] * sizes_ssd[i+1]) 30 | boxes.append((cx, cy, s, s)) 31 | 32 | s = sizes_ssd[i] 33 | for ar in aspect_ratios[i]: 34 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 35 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 36 | 37 | else: 38 | s = sizes_fusion[i-5] 39 | boxes.append((cx, cy, s, s)) 40 | 41 | s = math.sqrt(sizes_fusion[i-5] * sizes_fusion[i-5+1]) 42 | boxes.append((cx, cy, s, s)) 43 | 44 | s = sizes_fusion[i-5] 45 | for ar in aspect_ratios[i]: 46 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 47 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 48 | 49 | self.default_boxes = torch.Tensor(boxes) 50 | 51 | def iou(self, box1, box2): 52 | '''Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2]. 53 | 54 | Args: 55 | box1: (tensor) bounding boxes, sized [N,4]. 56 | box2: (tensor) bounding boxes, sized [M,4]. 57 | 58 | Return: 59 | (tensor) iou, sized [N,M]. 60 | ''' 61 | N = box1.size(0) 62 | M = box2.size(0) 63 | 64 | lt = torch.max( 65 | box1[:,:2].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 66 | box2[:,:2].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 67 | ) 68 | 69 | rb = torch.min( 70 | box1[:,2:].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 71 | box2[:,2:].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 72 | ) 73 | 74 | wh = rb - lt # [N,M,2] 75 | wh[wh<0] = 0 # clip at 0 76 | inter = wh[:,:,0] * wh[:,:,1] # [N,M] 77 | 78 | area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1]) # [N,] 79 | area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1]) # [M,] 80 | area1 = area1.unsqueeze(1).expand_as(inter) # [N,] -> [N,1] -> [N,M] 81 | area2 = area2.unsqueeze(0).expand_as(inter) # [M,] -> [1,M] -> [N,M] 82 | 83 | iou = inter / (area1 + area2 - inter) 84 | return iou 85 | 86 | def encode(self, boxes, classes, threshold=0.5): 87 | '''Transform target bounding boxes and class labels to SSD boxes and classes. 88 | 89 | Match each object box to all the default boxes, pick the ones with the 90 | Jaccard-Index > 0.5: 91 | Jaccard(A,B) = AB / (A+B-AB) 92 | 93 | Args: 94 | boxes: (tensor) object bounding boxes (xmin,ymin,xmax,ymax) of a image, sized [#obj, 4]. 95 | classes: (tensor) object class labels of a image, sized [#obj,]. 96 | threshold: (float) Jaccard index threshold 97 | 98 | Returns: 99 | boxes: (tensor) bounding boxes, sized [#obj, 8732, 4]. 
100 | classes: (tensor) class labels, sized [8732,] 101 | ''' 102 | default_boxes = self.default_boxes 103 | num_default_boxes = default_boxes.size(0) 104 | num_objs = boxes.size(0) 105 | 106 | iou = self.iou( # [#obj,8732] 107 | boxes, 108 | torch.cat([default_boxes[:,:2] - default_boxes[:,2:]/2, 109 | default_boxes[:,:2] + default_boxes[:,2:]/2], 1) 110 | ) 111 | 112 | iou, max_idx = iou.max(0) # [1,8732] 113 | max_idx.squeeze_(0) # [8732,] 114 | iou.squeeze_(0) # [8732,] 115 | 116 | boxes = boxes[max_idx] # [8732,4] 117 | variances = [0.1, 0.2] 118 | cxcy = (boxes[:,:2] + boxes[:,2:])/2 - default_boxes[:,:2] # [8732,2] 119 | cxcy /= variances[0] * default_boxes[:,2:] 120 | wh = (boxes[:,2:] - boxes[:,:2]) / default_boxes[:,2:] # [8732,2] 121 | wh = torch.log(wh) / variances[1] 122 | loc = torch.cat([cxcy, wh], 1) # [8732,4] 123 | 124 | conf = 1 + classes[max_idx] # [8732,], background class = 0 125 | conf[iou 0: 153 | try: 154 | i = order[0] 155 | except: 156 | i = order 157 | keep.append(i) 158 | 159 | if order.size == 1: 160 | break 161 | 162 | xx1 = x1[order[1:]].clamp(min=x1[i]) 163 | yy1 = y1[order[1:]].clamp(min=y1[i]) 164 | xx2 = x2[order[1:]].clamp(max=x2[i]) 165 | yy2 = y2[order[1:]].clamp(max=y2[i]) 166 | 167 | w = (xx2-xx1).clamp(min=0) 168 | h = (yy2-yy1).clamp(min=0) 169 | inter = w*h 170 | 171 | if mode == 'union': 172 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 173 | elif mode == 'min': 174 | ovr = inter / areas[order[1:]].clamp(max=areas[i]) 175 | else: 176 | raise TypeError('Unknown nms mode: %s.' % mode) 177 | 178 | ids = (ovr<=threshold).nonzero().squeeze() 179 | if ids.size == 0: 180 | break 181 | order = order[ids+1] 182 | return torch.LongTensor(keep) 183 | 184 | def decode(self, loc, conf): 185 | '''Transform predicted loc/conf back to real bbox locations and class labels. 186 | 187 | Args: 188 | loc: (tensor) predicted loc, sized [8732,4]. 189 | conf: (tensor) predicted conf, sized [8732,21]. 190 | 191 | Returns: 192 | boxes: (tensor) bbox locations, sized [#obj, 4]. 193 | labels: (tensor) class labels, sized [#obj,1]. 
194 | ''' 195 | variances = (0.1, 0.2) 196 | wh = torch.exp(loc[:,2:]*variances[1]) * self.default_boxes[:,2:] 197 | cxcy = loc[:,:2] * variances[0] * self.default_boxes[:,2:] + self.default_boxes[:,:2] 198 | box_preds = torch.cat([cxcy-wh/2, cxcy+wh/2], 1) # [8732,4] 199 | 200 | boxes = [] 201 | labels = [] 202 | scores = [] 203 | num_classes = conf.size(1) 204 | for i in range(num_classes-1): 205 | score = conf[:,i+1] # class i corresponds to (i+1) column 206 | mask = score > 0.1 207 | 208 | if not mask.any(): 209 | continue 210 | 211 | box = box_preds[mask.nonzero().squeeze()] 212 | score = score[mask] 213 | 214 | if len(score) == 1: 215 | continue 216 | keep = self.nms(box, score, threshold=0.3) 217 | boxes.append(box[keep]) 218 | labels.append(torch.LongTensor(len(box[keep])).fill_(i)) 219 | scores.append(score[keep]) 220 | 221 | return boxes, labels, scores 222 | -------------------------------------------------------------------------------- /MDSSD_512/scripts/fusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | from torch.autograd import Variable 7 | 8 | from norm import L2Norm 9 | 10 | class FusionBlock(nn.Module): 11 | def __init__(self, big_features, small_features): 12 | super(FusionBlock, self).__init__() 13 | 14 | # Bigger feature map 15 | self.conv1_1 = nn.Conv2d(big_features, 256, kernel_size=3, padding=1, dilation=1) 16 | self.Norm1 = L2Norm(256, 20) 17 | 18 | # Smaller feature map 19 | self.deconv2_1 = nn.ConvTranspose2d(small_features, 256, 2, stride=2, dilation=1) 20 | self.conv2_1 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 21 | self.deconv2_2 = nn.ConvTranspose2d(256, 256, 2, stride=2, dilation=1) 22 | self.conv2_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 23 | self.deconv2_3 = nn.ConvTranspose2d(256, 256, 3, stride=2, dilation=1) 24 | self.conv2_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 25 | self.Norm2 = L2Norm(256, 20) 26 | 27 | # Common 28 | self.conv3_1 = nn.Conv2d(256, big_features, kernel_size=3, padding=1, dilation=1) 29 | 30 | 31 | def forward(self, big, small): 32 | h1 = self.conv1_1(big) 33 | h1 = self.Norm1(h1) 34 | 35 | h2 = self.deconv2_1(small) 36 | # print(h2.size()) 37 | h2 = F.relu(self.conv2_1(h2)) 38 | # print(h2.size()) 39 | h2 = self.deconv2_2(h2) 40 | # print(h2.size()) 41 | h2 = F.relu(self.conv2_2(h2)) 42 | # print(h2.size()) 43 | h2 = self.deconv2_3(h2) 44 | # print(h2.size()) 45 | h2 = self.conv2_3(h2) 46 | # print(h2.size()) 47 | h2 = self.Norm2(h2) 48 | 49 | size = h2.size()[3] 50 | diff_odd = h2.size()[-1] - h1.size()[-1] 51 | h2 = h2[:,:,(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2)),(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2))] 52 | 53 | # print(h1.size(), h2.size()) 54 | h = F.relu(h1+h2) 55 | h = F.relu(self.conv3_1(h)) 56 | 57 | return h 58 | 59 | if __name__ == '__main__': 60 | big = torch.randn(1, 256, 128, 128) 61 | small = torch.rand(1,512,16,16) 62 | net = FusionBlock(256,512) 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /MDSSD_512/scripts/mdssd.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | 9 | from torch.autograd import Variable 10 | 11 | from multibox_layer 
import MultiBoxLayer 12 | from fusion import FusionBlock 13 | from norm import L2Norm 14 | 15 | class MDSSD300(nn.Module): 16 | input_size = 512 17 | 18 | def __init__(self): 19 | super(MDSSD300, self).__init__() 20 | 21 | # model 22 | self.base = self.VGG16() 23 | self.norm4 = L2Norm(512, 20) # 64 24 | 25 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 26 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 27 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 28 | 29 | self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 30 | 31 | self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 32 | 33 | self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1) 34 | self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2) #16 35 | 36 | self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1) 37 | self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2) #8 38 | 39 | self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1) 40 | self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1,stride=2) #4 41 | 42 | self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1) 43 | self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2) #2 44 | 45 | self.conv12_1 = nn.Conv2d(256, 128, kernel_size=1) 46 | self.conv12_2 = nn.Conv2d(128, 256, kernel_size=2) 47 | 48 | self.Fusion1 = FusionBlock(256,512) 49 | self.Fusion2 = FusionBlock(512,256) 50 | self.Fusion3 = FusionBlock(1024,256) 51 | 52 | # multibox layer 53 | self.multibox = MultiBoxLayer() 54 | 55 | def forward(self, x): 56 | odd_count = 0 57 | odd = [] 58 | hs = [] 59 | vgg = [] 60 | fusion_layers = [] 61 | h = self.base[0](x) 62 | vgg.append(h) 63 | for i in range(1,len(self.base)): 64 | h = self.base[i](h) 65 | # print(h.size()) 66 | vgg.append(h) 67 | fusion_layers.append(vgg[15]) 68 | odd.append(2) 69 | odd_count = 3 70 | fusion_layers.append(h) 71 | h = F.max_pool2d(h, kernel_size=2, stride=2, ceil_mode=True) 72 | # print(h.size()) 73 | 74 | h = F.relu(self.conv5_1(h)) 75 | h = F.relu(self.conv5_2(h)) 76 | h = F.relu(self.conv5_3(h)) 77 | h = F.max_pool2d(h, kernel_size=3, padding=1, stride=1, ceil_mode=True) 78 | # print(h.size()) 79 | 80 | h = F.relu(self.conv6(h)) 81 | h = F.relu(self.conv7(h)) 82 | # print(h.size()) 83 | fusion_layers.append(h) 84 | 85 | h = F.relu(self.conv8_1(h)) 86 | h = F.relu(self.conv8_2(h)) 87 | # print(h.size()) 88 | hs.append(h) # conv8_2 89 | 90 | h = F.relu(self.conv9_1(h)) 91 | h = F.relu(self.conv9_2(h)) 92 | # print(h.size()) 93 | hs.append(h) # conv9_2 94 | 95 | h = F.relu(self.conv10_1(h)) 96 | h = F.relu(self.conv10_2(h)) 97 | # print(h.size()) 98 | hs.append(h) # conv10_2 99 | 100 | h = F.relu(self.conv11_1(h)) 101 | h = F.relu(self.conv11_2(h)) 102 | # print(h.size()) 103 | hs.append(h) # conv11_2 104 | 105 | h = F.relu(self.conv12_1(h)) 106 | h = F.relu(self.conv12_2(h)) 107 | # print(h.size()) 108 | hs.append(h) # conv12_2 109 | 110 | # Fusion Blocks 111 | f = self.Fusion1(fusion_layers[0],hs[-5]) 112 | hs.append(f) 113 | f = self.Fusion2(fusion_layers[1],hs[-5]) 114 | hs.append(f) 115 | f = self.Fusion3(fusion_layers[2],hs[-5]) 116 | hs.append(f) 117 | 118 | loc_preds, conf_preds = self.multibox(hs) 119 | 120 | return loc_preds, conf_preds 121 | 122 | def VGG16(self): 123 | '''VGG16 layers.''' 124 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512] 125 | layers = [] 126 | in_channels = 3 127 | for x in cfg: 128 | if x == 'M': 129 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, 
ceil_mode=True)] 130 | else: 131 | layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 132 | nn.ReLU(True)] 133 | in_channels = x 134 | return nn.Sequential(*layers) 135 | 136 | if __name__ == '__main__': 137 | t = torch.randn(1, 3, 300, 300) 138 | net = MDSSD300() 139 | # print(net) 140 | res = net.forward(t) 141 | -------------------------------------------------------------------------------- /MDSSD_512/scripts/multibox_layer.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLayer(nn.Module): 12 | num_classes = 13 13 | num_anchors = [6,6,4,4,4,4,4,4] 14 | 15 | def __init__(self): 16 | super(MultiBoxLayer, self).__init__() 17 | self.in_planes = [512,256,256,256,256,256,512,1024] 18 | self.loc_layers = nn.ModuleList() 19 | self.conf_layers = nn.ModuleList() 20 | for i in range(len(self.in_planes)): 21 | self.loc_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*4, kernel_size=3, padding=1)) 22 | self.conf_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*13, kernel_size=3, padding=1)) 23 | 24 | def forward(self, xs): 25 | ''' 26 | Args: 27 | xs: (list) of tensor containing intermediate layer outputs. 28 | 29 | Returns: 30 | loc_preds: (tensor) predicted locations, sized [N,8732,4]. 31 | conf_preds: (tensor) predicted class confidences, sized [N,8732,21]. 32 | ''' 33 | y_locs = [] 34 | y_confs = [] 35 | for i, x in enumerate(xs): 36 | y_loc = self.loc_layers[i](x) 37 | N = y_loc.size(0) 38 | y_loc = y_loc.permute(0,2,3,1).contiguous() 39 | y_loc = y_loc.view(N,-1,4) 40 | y_locs.append(y_loc) 41 | 42 | y_conf = self.conf_layers[i](x) 43 | y_conf = y_conf.permute(0,2,3,1).contiguous() 44 | y_conf = y_conf.view(N,-1,13) 45 | y_confs.append(y_conf) 46 | 47 | loc_preds = torch.cat(y_locs, 1) 48 | conf_preds = torch.cat(y_confs, 1) 49 | return loc_preds, conf_preds 50 | -------------------------------------------------------------------------------- /MDSSD_512/scripts/multibox_loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLoss(nn.Module): 12 | num_classes = 13 13 | 14 | def __init__(self): 15 | super(MultiBoxLoss, self).__init__() 16 | 17 | def cross_entropy_loss(self, x, y): 18 | '''Cross entropy loss w/o averaging across all samples. 19 | 20 | Args: 21 | x: (tensor) sized [N,D]. 22 | y: (tensor) sized [N,]. 23 | 24 | Return: 25 | (tensor) cross entroy loss, sized [N,]. 26 | ''' 27 | xmax = x.data.max() 28 | print('x y size {} {}'.format(x.size(), y.size())) 29 | log_sum_exp = torch.log(torch.sum(torch.exp(x-xmax), 1)) + xmax 30 | print('log_sum_exp {}'.format(log_sum_exp.size())) 31 | return log_sum_exp - x.gather(1, y.view(-1,1)) 32 | 33 | def hard_negative_mining(self, conf_loss, pos): 34 | '''Return negative indices that is 3x the number as postive indices. 35 | 36 | Args: 37 | conf_loss: (tensor) cross entroy loss between conf_preds and conf_targets, sized [N*8732,]. 38 | pos: (tensor) positive(matched) box indices, sized [N,8732]. 39 | 40 | Return: 41 | (tensor) negative indices, sized [N,8732]. 
42 | ''' 43 | batch_size, num_boxes = pos.size() 44 | conf_loss[pos.view(-1)] = 0 # set pos boxes = 0, the rest are neg conf_loss 45 | conf_loss = conf_loss.view(batch_size, -1) # [N,8732] 46 | 47 | _,idx = conf_loss.sort(1, descending=True) # sort by neg conf_loss 48 | _,rank = idx.sort(1) # [N,8732] 49 | 50 | num_pos = pos.long().sum(1) # [N,1] 51 | num_neg = torch.clamp(3*num_pos, max=num_boxes-1) # [N,1] 52 | 53 | neg = rank < num_neg.unsqueeze(1).expand_as(rank) 54 | 55 | return neg 56 | 57 | def forward(self, loc_preds, loc_targets, conf_preds, conf_targets): 58 | '''Compute loss between (loc_preds, loc_targets) and (conf_preds, conf_targets). 59 | 60 | Args: 61 | loc_preds: (tensor) predicted locations, sized [batch_size, 8732, 4]. 62 | loc_targets: (tensor) encoded target locations, sized [batch_size, 8732, 4]. 63 | conf_preds: (tensor) predicted class confidences, sized [batch_size, 8732, num_classes]. 64 | conf_targets: (tensor) encoded target classes, sized [batch_size, 8732]. 65 | 66 | loss: 67 | (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + CrossEntropyLoss(conf_preds, conf_targets). 68 | ''' 69 | 70 | # loc_preds = loc_preds[:,:8732,:] 71 | # conf_preds = conf_preds[:,:8732,:] 72 | 73 | batch_size, num_boxes, _ = loc_preds.size() 74 | pos = conf_targets > 0 # [N,8732], pos means the box matched. 75 | # print(pos.size()) 76 | num_matched_boxes = pos.data.float().sum() 77 | if num_matched_boxes == 0: 78 | return torch.tensor([0.], requires_grad=True) 79 | 80 | ################################################################ 81 | # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets) 82 | ################################################################ 83 | pos_mask = pos.unsqueeze(2).expand_as(loc_preds) # [N,8732,4] 84 | pos_loc_preds = loc_preds[pos_mask].view(-1,4) # [#pos,4] 85 | pos_loc_targets = loc_targets[pos_mask].view(-1,4) # [#pos,4] 86 | 87 | loc_loss = F.smooth_l1_loss(pos_loc_preds, pos_loc_targets, size_average=False) 88 | 89 | ################################################################ 90 | # conf_loss = CrossEntropyLoss(pos_conf_preds, pos_conf_targets) 91 | # + CrossEntropyLoss(neg_conf_preds, neg_conf_targets) 92 | ################################################################ 93 | conf_preds = conf_preds.contiguous() 94 | # print(conf_preds.size(), conf_targets.size()) 95 | conf_loss = F.cross_entropy(conf_preds.view(-1,self.num_classes), \ 96 | conf_targets.view(-1), reduce=False) # [N*8732,] 97 | neg = self.hard_negative_mining(conf_loss, pos) # [N,8732] 98 | 99 | pos_mask = pos.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 100 | neg_mask = neg.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 101 | mask = (pos_mask+neg_mask).gt(0) 102 | 103 | pos_and_neg = (pos+neg).gt(0) 104 | preds = conf_preds[mask].view(-1,self.num_classes) # [#pos+#neg,21] 105 | targets = conf_targets[pos_and_neg] # [#pos+#neg,] 106 | conf_loss = F.cross_entropy(preds, targets, size_average=False) 107 | 108 | loc_loss /= num_matched_boxes 109 | conf_loss /= num_matched_boxes 110 | 111 | return loc_loss + conf_loss 112 | -------------------------------------------------------------------------------- /MDSSD_512/scripts/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | class L2Norm(nn.Module): 7 | '''L2Norm layer across all channels and scale.''' 8 | def __init__(self, in_features,scale): 9 | super(L2Norm, 
self).__init__() 10 | self.weight = nn.Parameter(torch.Tensor(in_features)) 11 | self.reset_parameters(scale) 12 | 13 | def reset_parameters(self, scale): 14 | nn.init.constant_(self.weight, scale) 15 | 16 | def forward(self, x): 17 | x = F.normalize(x, dim=1) 18 | scale = self.weight[None,:,None,None] 19 | return scale * x -------------------------------------------------------------------------------- /MDSSD_512/scripts/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.nn.functional as F 4 | import torchvision.transforms as transforms 5 | 6 | import sys 7 | from mdssd import MDSSD300 8 | from encoder import DataEncoder 9 | import cv2 10 | 11 | VOC_LABELS = ( 12 | 'ignored regions', 13 | 'pedestrian', 14 | 'people', 15 | 'bicycle', 16 | 'car', 17 | 'van', 18 | 'truck', 19 | 'tricycle', 20 | 'awning-tricycle', 21 | 'bus', 22 | 'motor', 23 | 'other' 24 | ) 25 | 26 | 27 | # Load model 28 | net = MDSSD300() 29 | checkpoint = torch.load('./checkpoint/ckpt.pth') 30 | 31 | keys = [] 32 | for k,v in checkpoint['net'].items(): 33 | if "module" in k: 34 | keys.append(k) 35 | for i in keys: 36 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 37 | del checkpoint['net'][i] 38 | 39 | net.load_state_dict(checkpoint['net']) 40 | net.eval() 41 | 42 | if len(sys.argv) == 2: 43 | img_path = sys.argv[1] 44 | else: 45 | img_path = './images/img5.jpg' 46 | 47 | # Load test image 48 | img = cv2.imread(img_path) 49 | img1 = cv2.resize(img, (512, 312)) 50 | transform = transforms.Compose([transforms.ToTensor(), 51 | transforms.Normalize(mean=(0.356, 0.368, 0.362), std=(0.242, 0.235, 0.236))]) 52 | img1 = transform(img1) 53 | 54 | # Forward 55 | with torch.no_grad(): 56 | x = torch.tensor(img1) 57 | loc_preds, conf = net(x.unsqueeze(0)) 58 | # Decode 59 | data_encoder = DataEncoder() 60 | boxes, labels, scores = data_encoder.decode(loc_preds.data.squeeze(0), F.softmax(conf.squeeze(0), dim=1).data) 61 | for box, label, score in zip(boxes, labels, scores): 62 | for b, s in zip(box, score): 63 | if s > 0.25: 64 | b[::2] *= img.shape[1] 65 | b[1::2] *= img.shape[0] 66 | print('label:',VOC_LABELS[int(label[0])], 'score:', score) 67 | b = list(b) 68 | cv2.rectangle(img, (b[0], b[1]), (b[2], b[3]), (255, 255, 255), 2) 69 | title = '{}: {}'.format(VOC_LABELS[int(label[0])], round(float(score[0]), 2)) 70 | cv2.putText(img, title, (b[0], b[1]), cv2.FONT_ITALIC, 0.6, (0, 255, 0), 2) 71 | cv2.imshow('img', img) 72 | cv2.waitKey(0) -------------------------------------------------------------------------------- /MDSSD_512/scripts/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import argparse 5 | import itertools 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | import torch.backends.cudnn as cudnn 12 | 13 | import torchvision 14 | import torchvision.transforms as transforms 15 | 16 | import numpy as np 17 | from mdssd import MDSSD300 18 | from datagen import ListDataset 19 | from multibox_loss import MultiBoxLoss 20 | 21 | lr = 0.001 22 | resume = False # Resume from checkpoint 23 | epoch = 200 24 | batch_size = 8 25 | 26 | use_cuda = torch.cuda.is_available() 27 | best_loss = float('inf') # best test loss 28 | start_epoch = 0 # start from epoch 0 or last epoch 29 | 30 | # Data 31 | print('==> Preparing data..') 32 | transform = 
transforms.Compose([transforms.ToTensor(), 33 | transforms.Normalize(mean=(0.356, 0.368, 0.362), std=(0.242, 0.235, 0.236))]) 34 | 35 | trainset = ListDataset(root='../../../../VisDrone2019/dataset/VisDrone2018-DET-train/images/', list_file='../../../../VisDrone2019/dataset/VisDrone2018-DET-train/annotations/', train=True, transform=transform) 36 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True, num_workers=4) 37 | 38 | valset = ListDataset(root='../../../../VisDrone2019/dataset/VisDrone2019-DET-val/images/', list_file='../../../../VisDrone2019/dataset/VisDrone2019-DET-val/annotations/', train=True, transform=transform) 39 | valloader = torch.utils.data.DataLoader(valset, batch_size=8, shuffle=True, num_workers=4) 40 | 41 | # Model 42 | net = MDSSD300() 43 | if resume: 44 | print('==> Resuming from checkpoint..') 45 | checkpoint = torch.load('./checkpoint/ckpt.pth') 46 | 47 | keys = [] 48 | for k,v in checkpoint['net'].items(): 49 | if "module" in k: 50 | keys.append(k) 51 | for i in keys: 52 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 53 | del checkpoint['net'][i] 54 | 55 | net.load_state_dict(checkpoint['net']) 56 | best_loss = checkpoint['loss'] 57 | start_epoch = checkpoint['epoch'] 58 | else: 59 | # Convert from pretrained VGG model. 60 | try: 61 | net.load_state_dict(torch.load('../model/ssd.pth')) 62 | print('==> Pretrain model read successfully') 63 | except: 64 | print('==> Pretrain model read failed or not existed, training from init') 65 | 66 | criterion = MultiBoxLoss() 67 | 68 | if use_cuda: 69 | net = torch.nn.DataParallel(net, device_ids=[0]) 70 | net.cuda() 71 | cudnn.benchmark = True 72 | 73 | optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4) 74 | # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.25) 75 | 76 | # Training 77 | def train(epoch,prev_val_loss, last_saved): 78 | print('\nEpoch: %d' % epoch) 79 | net.train() 80 | train_loss = 0 81 | for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader): 82 | if use_cuda: 83 | images = images.cuda() 84 | loc_targets = loc_targets.cuda() 85 | conf_targets = conf_targets.cuda() 86 | 87 | images = torch.tensor(images) 88 | loc_targets = torch.tensor(loc_targets) 89 | conf_targets = torch.tensor(conf_targets) 90 | 91 | optimizer.zero_grad() 92 | loc_preds, conf_preds = net(images) 93 | loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 94 | loss.backward() 95 | optimizer.step() 96 | # scheduler.step() 97 | 98 | train_loss += loss.item() 99 | 100 | if batch_idx%100 == 0: 101 | val_loss_tot = 0 102 | for batch_idx_val, (images, loc_targets, conf_targets) in enumerate(valloader): 103 | if use_cuda: 104 | images = images.cuda() 105 | loc_targets = loc_targets.cuda() 106 | conf_targets = conf_targets.cuda() 107 | 108 | images = torch.tensor(images) 109 | loc_targets = torch.tensor(loc_targets) 110 | conf_targets = torch.tensor(conf_targets) 111 | 112 | loc_preds, conf_preds = net(images) 113 | val_loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 114 | val_loss_tot += val_loss.item() 115 | 116 | val_loss_tot /= (batch_idx_val+1) 117 | if val_loss_tot < prev_val_loss: 118 | os.makedirs('checkpoint', exist_ok=True) 119 | torch.save({ 120 | 'epoch': epoch, 121 | 'net': net.state_dict(), 122 | 'loss': loss, 123 | }, 'checkpoint/ckpt.pth') 124 | print("Saved.") 125 | prev_val_loss = val_loss_tot 126 | last_saved = [epoch, batch_idx] 127 | 128 | print('epoch: {}, batch_idx: {},loss: {}, train_loss: 
{}, best_val_loss: {}, last_saved: {}'.format(epoch, batch_idx, loss.item(), train_loss/(batch_idx+1), prev_val_loss, last_saved)) 129 | 130 | return prev_val_loss, last_saved 131 | 132 | 133 | prev_val_loss = 999 134 | last_saved = [start_epoch,0] 135 | for epoch_num in range(start_epoch, start_epoch+epoch): 136 | prev_val_loss, last_saved = train(epoch_num, prev_val_loss, last_saved) 137 | -------------------------------------------------------------------------------- /MDSSD_augment/scripts/augment.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import pandas as pd 3 | import shutil 4 | import os 5 | import numpy as np 6 | import glob 7 | import xml.etree.ElementTree as ET 8 | from xml.dom import minidom 9 | import random 10 | 11 | IMG_DIR = '../../../../VisDrone2019/dataset/VisDrone2018-DET-train/images/' 12 | ANNOT_DIR = '../../../../VisDrone2019/dataset/VisDrone2018-DET-train/annotations/' 13 | 14 | for i in os.listdir(ANNOT_DIR): 15 | box = [] 16 | labels = [] 17 | with open(os.path.join(ANNOT_DIR,i)) as f: 18 | f = f.read().split("\n") 19 | f = f[:-1] 20 | num_objs = len(f) 21 | 22 | for j in range(num_objs): 23 | f[j] = f[j].split(",") 24 | xmin = float(f[j][0]) 25 | ymin = float(f[j][1]) 26 | w = float(f[j][2]) 27 | h = float(f[j][3]) 28 | 29 | box.append([xmin,ymin,w,h]) 30 | labels.append(int(f[j][5])) 31 | 32 | img = cv2.imread(IMG_DIR+i[:-4]+".jpg") 33 | box_new = box.copy() 34 | img_new = img.copy() 35 | # cv2.imshow("Image", img) 36 | # cv2.waitKey(0) 37 | 38 | for j in box: 39 | if j[2]*j[3]<500: 40 | crop = img[int(j[1]):int(j[1]+j[3]),int(j[0]):int(j[0]+j[2])] 41 | x = random.randrange(0, img.shape[1],1) 42 | y = random.randrange(0, img.shape[0],1) 43 | 44 | try: 45 | img_new[int(y):int(y+j[3]),int(x):int(x+j[2])] = crop 46 | box_new.append([x,y,j[2],j[3]]) 47 | except: 48 | continue 49 | for j in box_new: 50 | img_new = cv2.rectangle(img_new,(int(j[0]),int(j[1])),(int(j[0]+j[2]),int(j[1]+j[3])),(255,0,0),1) 51 | 52 | cv2.imshow("Image", img_new) 53 | cv2.waitKey(0) 54 | break -------------------------------------------------------------------------------- /MDSSD_augment/scripts/datagen.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load image/class/box from a annotation file. 3 | 4 | ''' 5 | from __future__ import print_function 6 | 7 | import os 8 | import sys 9 | import os.path 10 | 11 | import random 12 | import numpy as np 13 | 14 | import torch 15 | import torch.utils.data as data 16 | import torchvision.transforms as transforms 17 | 18 | from encoder import DataEncoder 19 | import cv2 20 | 21 | import pandas as pd 22 | import shutil 23 | import os 24 | import numpy as np 25 | import glob 26 | import xml.etree.ElementTree as ET 27 | from xml.dom import minidom 28 | 29 | class ListDataset(data.Dataset): 30 | img_size = 300 31 | 32 | def __init__(self, root, list_file, train, transform): 33 | ''' 34 | Args: 35 | root: (str) ditectory to images. 36 | list_file: (str) path to annotation files. 37 | train: (boolean) train or test. 38 | transform: ([transforms]) image transforms. 
39 | ''' 40 | self.root = root 41 | self.list_file = list_file 42 | self.train = train 43 | self.transform = transform 44 | 45 | self.fnames = [] 46 | self.boxes = [] 47 | self.labels = [] 48 | 49 | self.data_encoder = DataEncoder() 50 | self.num_samples = 0 51 | 52 | for i in os.listdir(list_file): 53 | self.num_samples += 1 54 | self.fnames.append(i) 55 | 56 | def __getitem__(self, idx): 57 | '''Load a image, and encode its bbox locations and class labels. 58 | Args: 59 | idx: (int) image index. 60 | Returns: 61 | img: (tensor) image tensor. 62 | loc_target: (tensor) location targets, sized [8732,4]. 63 | conf_target: (tensor) label targets, sized [8732,]. 64 | ''' 65 | # Load image and bbox locations. 66 | fname = self.fnames[idx] 67 | img = cv2.imread(os.path.join(self.root, fname[:-4]+".jpg")) 68 | 69 | box = [] 70 | label = [] 71 | with open(os.path.join(self.list_file,fname)) as f: 72 | f = f.read().split("\n") 73 | f = f[:-1] 74 | num_objs = len(f) 75 | 76 | for j in range(num_objs): 77 | f[j] = f[j].split(",") 78 | xmin = float(f[j][0]) 79 | ymin = float(f[j][1]) 80 | w = float(f[j][2]) 81 | h = float(f[j][3]) 82 | 83 | box.append([xmin,ymin,xmin+w,ymin+h]) 84 | label.append(int(f[j][5])) 85 | 86 | # **************************** AUGMENTATION ************************************ 87 | # Copy and paste small objects at random locations in 88 | # image to increase the number of samples with small sizes. 89 | box_new = box.copy() 90 | label_new = label.copy() 91 | img_new = img.copy() 92 | for n in range(len(box)): 93 | j = box[n] 94 | if j[2]*j[3]<500: 95 | crop = img[int(j[1]):int(j[1]+j[3]),int(j[0]):int(j[0]+j[2])] 96 | x = random.randrange(0, img.shape[1],1) 97 | y = random.randrange(0, img.shape[0],1) 98 | 99 | try: 100 | img_new[int(y):int(y+j[3]),int(x):int(x+j[2])] = crop 101 | box_new.append([x,y,j[2],j[3]]) 102 | label_new.append(label[n]) 103 | except: 104 | continue 105 | 106 | # ******************************************************************************** 107 | 108 | self.boxes.append(torch.Tensor(box_new)) 109 | self.labels.append(torch.LongTensor(label_new)) 110 | img = img_new 111 | 112 | boxes = self.boxes[-1].clone() 113 | labels = self.labels[-1] 114 | 115 | # Data augmentation while training. 116 | if self.train: 117 | img, boxes = self.random_flip(img, boxes) 118 | img, boxes, labels = self.random_crop(img, boxes, labels) 119 | 120 | # Scale bbox locaitons to [0,1]. 121 | w,h = img.shape[1], img.shape[0] 122 | boxes /= torch.Tensor([w,h,w,h]).expand_as(boxes) 123 | img = cv2.resize(img, (self.img_size,self.img_size)) 124 | img = self.transform(img) 125 | 126 | # Encode loc & conf targets. 127 | 128 | loc_target, conf_target = self.data_encoder.encode(boxes, labels) 129 | return img, loc_target, conf_target 130 | 131 | def random_flip(self, img, boxes): 132 | '''Randomly flip the image and adjust the bbox locations. 133 | For bbox (xmin, ymin, xmax, ymax), the flipped bbox is: 134 | (w-xmax, ymin, w-xmin, ymax). 135 | Args: 136 | img: (ndarray.Image) image. f 137 | boxes: (tensor) bbox locations, sized [#obj, 4]. 138 | Returns: 139 | img: (ndarray.Image) randomly flipped image. 140 | boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4]. 
141 | ''' 142 | if random.random() < 0.5: 143 | img = cv2.flip(img, 1) 144 | w = img.shape[1] 145 | xmin = w - boxes[:,2] 146 | xmax = w - boxes[:,0] 147 | boxes[:,0] = xmin 148 | boxes[:,2] = xmax 149 | return img, boxes 150 | 151 | def random_crop(self, img, boxes, labels): 152 | '''Randomly crop the image and adjust the bbox locations. 153 | For more details, see 'Chapter2.2: Data augmentation' of the paper. 154 | Args: 155 | img: (ndarray.Image) image. 156 | boxes: (tensor) bbox locations, sized [#obj, 4]. 157 | labels: (tensor) bbox labels, sized [#obj,]. 158 | Returns: 159 | img: (ndarray.Image) cropped image. 160 | selected_boxes: (tensor) selected bbox locations. 161 | labels: (tensor) selected bbox labels. 162 | ''' 163 | imw, imh = img.shape[1], img.shape[0] 164 | while True: 165 | min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])# random choice the one 166 | if min_iou is None: 167 | return img, boxes, labels 168 | 169 | for _ in range(100): 170 | w = random.randrange(int(0.1*imw), imw) 171 | h = random.randrange(int(0.1*imh), imh) 172 | 173 | if h > 2*w or w > 2*h or h < 1 or w < 1: 174 | continue 175 | 176 | x = random.randrange(imw - w) 177 | y = random.randrange(imh - h) 178 | roi = torch.Tensor([[x, y, x+w, y+h]]) 179 | 180 | center = (boxes[:,:2] + boxes[:,2:]) / 2 # [N,2] 181 | roi2 = roi.expand(len(center), 4) # [N,4] 182 | 183 | mask = (center > roi2[:,:2]) & (center < roi2[:,2:]) # [N,2] 184 | mask = mask[:,0] & mask[:,1] #[N,] 185 | 186 | if not mask.any(): 187 | continue 188 | 189 | selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1)) 190 | 191 | iou = self.data_encoder.iou(selected_boxes, roi) 192 | if iou.min() < min_iou: 193 | continue 194 | img = img[y:y+h, x:x+w, :] 195 | 196 | selected_boxes[:,0].add_(-x).clamp_(min=0, max=w) 197 | selected_boxes[:,1].add_(-y).clamp_(min=0, max=h) 198 | selected_boxes[:,2].add_(-x).clamp_(min=0, max=w) 199 | selected_boxes[:,3].add_(-y).clamp_(min=0, max=h) 200 | 201 | return img, selected_boxes, labels[mask] 202 | 203 | def __len__(self): 204 | return self.num_samples -------------------------------------------------------------------------------- /MDSSD_augment/scripts/encoder.py: -------------------------------------------------------------------------------- 1 | '''Encode target locations and labels.''' 2 | import torch 3 | 4 | import math 5 | import itertools 6 | 7 | class DataEncoder: 8 | def __init__(self): 9 | '''Compute default box sizes with scale and aspect transform.''' 10 | scale = 300. 
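        # Sanity-check sketch, assuming the feature_map_sizes defined just below
        # and the anchor counts from multibox_layer.py ([6,6,4,4,4,4,4]): the
        # fused 300 configuration gives
        #
        #   fm_sizes         = (10, 5, 3, 1, 75, 38, 19)
        #   anchors_per_cell = (6, 6, 4, 4, 4, 4, 4)
        #   sum(a * f * f for a, f in zip(anchors_per_cell, fm_sizes))  # -> 30510
        #
        # i.e. 30510 default boxes per image rather than the 8732 quoted in the
        # docstrings.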
11 | steps = [s / scale for s in (32, 64, 100, 300, 4, 8, 16)] 12 | sizes_ssd = [s / scale for s in (111, 162, 213, 264, 315)] 13 | sizes_fusion = [s / scale for s in (15, 30, 60, 111)] 14 | aspect_ratios = ((2,3), (2,3), (2,), (2,), (2,), (2,), (2,)) 15 | feature_map_sizes = (10, 5, 3, 1, 75, 38, 19) 16 | num_layers = len(feature_map_sizes) 17 | 18 | boxes = [] 19 | for i in range(num_layers): 20 | fmsize = feature_map_sizes[i] # feature map size 21 | for h,w in itertools.product(range(fmsize), repeat=2): 22 | cx = (w + 0.5)*steps[i] 23 | cy = (h + 0.5)*steps[i] 24 | 25 | if i<4: 26 | s = sizes_ssd[i] 27 | boxes.append((cx, cy, s, s)) 28 | 29 | s = math.sqrt(sizes_ssd[i] * sizes_ssd[i+1]) 30 | boxes.append((cx, cy, s, s)) 31 | 32 | s = sizes_ssd[i] 33 | for ar in aspect_ratios[i]: 34 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 35 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 36 | 37 | else: 38 | s = sizes_fusion[i-4] 39 | boxes.append((cx, cy, s, s)) 40 | 41 | s = math.sqrt(sizes_fusion[i-4] * sizes_fusion[i-4+1]) 42 | boxes.append((cx, cy, s, s)) 43 | 44 | s = sizes_fusion[i-4] 45 | for ar in aspect_ratios[i]: 46 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 47 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 48 | 49 | self.default_boxes = torch.Tensor(boxes) 50 | 51 | def iou(self, box1, box2): 52 | '''Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2]. 53 | 54 | Args: 55 | box1: (tensor) bounding boxes, sized [N,4]. 56 | box2: (tensor) bounding boxes, sized [M,4]. 57 | 58 | Return: 59 | (tensor) iou, sized [N,M]. 60 | ''' 61 | N = box1.size(0) 62 | M = box2.size(0) 63 | 64 | lt = torch.max( 65 | box1[:,:2].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 66 | box2[:,:2].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 67 | ) 68 | 69 | rb = torch.min( 70 | box1[:,2:].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 71 | box2[:,2:].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 72 | ) 73 | 74 | wh = rb - lt # [N,M,2] 75 | wh[wh<0] = 0 # clip at 0 76 | inter = wh[:,:,0] * wh[:,:,1] # [N,M] 77 | 78 | area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1]) # [N,] 79 | area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1]) # [M,] 80 | area1 = area1.unsqueeze(1).expand_as(inter) # [N,] -> [N,1] -> [N,M] 81 | area2 = area2.unsqueeze(0).expand_as(inter) # [M,] -> [1,M] -> [N,M] 82 | 83 | iou = inter / (area1 + area2 - inter) 84 | return iou 85 | 86 | def encode(self, boxes, classes, threshold=0.5): 87 | '''Transform target bounding boxes and class labels to SSD boxes and classes. 88 | 89 | Match each object box to all the default boxes, pick the ones with the 90 | Jaccard-Index > 0.5: 91 | Jaccard(A,B) = AB / (A+B-AB) 92 | 93 | Args: 94 | boxes: (tensor) object bounding boxes (xmin,ymin,xmax,ymax) of a image, sized [#obj, 4]. 95 | classes: (tensor) object class labels of a image, sized [#obj,]. 96 | threshold: (float) Jaccard index threshold 97 | 98 | Returns: 99 | boxes: (tensor) bounding boxes, sized [#obj, 8732, 4]. 
100 | classes: (tensor) class labels, sized [8732,] 101 | ''' 102 | default_boxes = self.default_boxes 103 | num_default_boxes = default_boxes.size(0) 104 | num_objs = boxes.size(0) 105 | 106 | iou = self.iou( # [#obj,8732] 107 | boxes, 108 | torch.cat([default_boxes[:,:2] - default_boxes[:,2:]/2, 109 | default_boxes[:,:2] + default_boxes[:,2:]/2], 1) 110 | ) 111 | 112 | iou, max_idx = iou.max(0) # [1,8732] 113 | max_idx.squeeze_(0) # [8732,] 114 | iou.squeeze_(0) # [8732,] 115 | 116 | boxes = boxes[max_idx] # [8732,4] 117 | variances = [0.1, 0.2] 118 | cxcy = (boxes[:,:2] + boxes[:,2:])/2 - default_boxes[:,:2] # [8732,2] 119 | cxcy /= variances[0] * default_boxes[:,2:] 120 | wh = (boxes[:,2:] - boxes[:,:2]) / default_boxes[:,2:] # [8732,2] 121 | wh = torch.log(wh) / variances[1] 122 | loc = torch.cat([cxcy, wh], 1) # [8732,4] 123 | 124 | conf = 1 + classes[max_idx] # [8732,], background class = 0 125 | conf[iou 0: 153 | try: 154 | i = order[0] 155 | except: 156 | i = order 157 | keep.append(i) 158 | 159 | if order.size == 1: 160 | break 161 | 162 | xx1 = x1[order[1:]].clamp(min=x1[i]) 163 | yy1 = y1[order[1:]].clamp(min=y1[i]) 164 | xx2 = x2[order[1:]].clamp(max=x2[i]) 165 | yy2 = y2[order[1:]].clamp(max=y2[i]) 166 | 167 | w = (xx2-xx1).clamp(min=0) 168 | h = (yy2-yy1).clamp(min=0) 169 | inter = w*h 170 | 171 | if mode == 'union': 172 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 173 | elif mode == 'min': 174 | ovr = inter / areas[order[1:]].clamp(max=areas[i]) 175 | else: 176 | raise TypeError('Unknown nms mode: %s.' % mode) 177 | 178 | ids = (ovr<=threshold).nonzero().squeeze() 179 | if ids.size == 0: 180 | break 181 | order = order[ids+1] 182 | return torch.LongTensor(keep) 183 | 184 | def decode(self, loc, conf): 185 | '''Transform predicted loc/conf back to real bbox locations and class labels. 186 | 187 | Args: 188 | loc: (tensor) predicted loc, sized [8732,4]. 189 | conf: (tensor) predicted conf, sized [8732,21]. 190 | 191 | Returns: 192 | boxes: (tensor) bbox locations, sized [#obj, 4]. 193 | labels: (tensor) class labels, sized [#obj,1]. 
194 | ''' 195 | variances = (0.1, 0.2) 196 | wh = torch.exp(loc[:,2:]*variances[1]) * self.default_boxes[:,2:] 197 | cxcy = loc[:,:2] * variances[0] * self.default_boxes[:,2:] + self.default_boxes[:,:2] 198 | box_preds = torch.cat([cxcy-wh/2, cxcy+wh/2], 1) # [8732,4] 199 | 200 | boxes = [] 201 | labels = [] 202 | scores = [] 203 | num_classes = conf.size(1) 204 | for i in range(num_classes-1): 205 | score = conf[:,i+1] # class i corresponds to (i+1) column 206 | mask = score > 0.1 207 | 208 | if not mask.any(): 209 | continue 210 | 211 | box = box_preds[mask.nonzero().squeeze()] 212 | score = score[mask] 213 | 214 | if len(score) == 1: 215 | continue 216 | keep = self.nms(box, score, threshold=0.3) 217 | boxes.append(box[keep]) 218 | labels.append(torch.LongTensor(len(box[keep])).fill_(i)) 219 | scores.append(score[keep]) 220 | 221 | return boxes, labels, scores 222 | -------------------------------------------------------------------------------- /MDSSD_augment/scripts/fusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | from torch.autograd import Variable 7 | 8 | from norm import L2Norm 9 | 10 | class FusionBlock(nn.Module): 11 | def __init__(self, big_features, small_features): 12 | super(FusionBlock, self).__init__() 13 | 14 | # Bigger feature map 15 | self.conv1_1 = nn.Conv2d(big_features, 256, kernel_size=3, padding=1, dilation=1) 16 | self.Norm1 = L2Norm(256, 20) 17 | 18 | # Smaller feature map 19 | self.deconv2_1 = nn.ConvTranspose2d(small_features, 256, 2, stride=2, dilation=1) 20 | self.conv2_1 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 21 | self.deconv2_2 = nn.ConvTranspose2d(256, 256, 2, stride=2, dilation=1) 22 | self.conv2_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 23 | self.deconv2_3 = nn.ConvTranspose2d(256, 256, 3, stride=2, dilation=1) 24 | self.conv2_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 25 | self.Norm2 = L2Norm(256, 20) 26 | 27 | # Common 28 | self.conv3_1 = nn.Conv2d(256, big_features, kernel_size=3, padding=1, dilation=1) 29 | 30 | 31 | def forward(self, big, small): 32 | h1 = self.conv1_1(big) 33 | h1 = self.Norm1(h1) 34 | 35 | h2 = self.deconv2_1(small) 36 | # print(h2.size()) 37 | h2 = F.relu(self.conv2_1(h2)) 38 | # print(h2.size()) 39 | h2 = self.deconv2_2(h2) 40 | # print(h2.size()) 41 | h2 = F.relu(self.conv2_2(h2)) 42 | # print(h2.size()) 43 | h2 = self.deconv2_3(h2) 44 | # print(h2.size()) 45 | h2 = self.conv2_3(h2) 46 | # print(h2.size()) 47 | h2 = self.Norm2(h2) 48 | 49 | size = h2.size()[3] 50 | diff_odd = h2.size()[-1] - h1.size()[-1] 51 | h2 = h2[:,:,(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2)),(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2))] 52 | 53 | # print(h1.size(), h2.size()) 54 | h = F.relu(h1+h2) 55 | h = F.relu(self.conv3_1(h)) 56 | 57 | return h 58 | 59 | if __name__ == '__main__': 60 | big = torch.randn(1, 256, 128, 128) 61 | small = torch.rand(1,512,16,16) 62 | net = FusionBlock(256,512) 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /MDSSD_augment/scripts/mdssd.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | 9 | from torch.autograd import Variable 10 | 11 | from 
multibox_layer import MultiBoxLayer 12 | from fusion import FusionBlock 13 | from norm import L2Norm 14 | 15 | class MDSSD300(nn.Module): 16 | input_size = 300 17 | 18 | def __init__(self): 19 | super(MDSSD300, self).__init__() 20 | 21 | # model 22 | self.base = self.VGG16() 23 | self.norm4 = L2Norm(512, 20) # 38 24 | 25 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 26 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 27 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 28 | 29 | self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 30 | 31 | self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 32 | 33 | self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1) 34 | self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2) 35 | 36 | self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1) 37 | self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2) 38 | 39 | self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1) 40 | self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3) 41 | 42 | self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1) 43 | self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3) 44 | 45 | self.Fusion1 = FusionBlock(256,512) 46 | self.Fusion2 = FusionBlock(512,256) 47 | self.Fusion3 = FusionBlock(1024,256) 48 | 49 | # multibox layer 50 | self.multibox = MultiBoxLayer() 51 | 52 | def forward(self, x): 53 | odd_count = 0 54 | odd = [] 55 | hs = [] 56 | vgg = [] 57 | fusion_layers = [] 58 | h = self.base[0](x) 59 | vgg.append(h) 60 | for i in range(1,len(self.base)): 61 | h = self.base[i](h) 62 | vgg.append(h) 63 | fusion_layers.append(vgg[15]) 64 | odd.append(2) 65 | odd_count = 3 66 | fusion_layers.append(h) 67 | h = F.max_pool2d(h, kernel_size=2, stride=2, ceil_mode=True) 68 | 69 | h = F.relu(self.conv5_1(h)) 70 | h = F.relu(self.conv5_2(h)) 71 | h = F.relu(self.conv5_3(h)) 72 | h = F.max_pool2d(h, kernel_size=3, padding=1, stride=1, ceil_mode=True) 73 | 74 | h = F.relu(self.conv6(h)) 75 | h = F.relu(self.conv7(h)) 76 | fusion_layers.append(h) 77 | 78 | h = F.relu(self.conv8_1(h)) 79 | h = F.relu(self.conv8_2(h)) 80 | hs.append(h) # conv8_2 81 | 82 | h = F.relu(self.conv9_1(h)) 83 | h = F.relu(self.conv9_2(h)) 84 | hs.append(h) # conv9_2 85 | 86 | h = F.relu(self.conv10_1(h)) 87 | h = F.relu(self.conv10_2(h)) 88 | hs.append(h) # conv10_2 89 | 90 | h = F.relu(self.conv11_1(h)) 91 | h = F.relu(self.conv11_2(h)) 92 | hs.append(h) # conv11_2 93 | 94 | # Fusion Blocks 95 | f = self.Fusion1(fusion_layers[0],hs[-4]) 96 | hs.append(f) 97 | f = self.Fusion2(fusion_layers[1],hs[-4]) 98 | hs.append(f) 99 | diff_odd = fusion_layers[2].size()[-1] - hs[-4].size()[-1] 100 | f = self.Fusion3(fusion_layers[2],hs[-4]) 101 | hs.append(f) 102 | 103 | loc_preds, conf_preds = self.multibox(hs) 104 | 105 | return loc_preds, conf_preds 106 | 107 | def VGG16(self): 108 | '''VGG16 layers.''' 109 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512] 110 | layers = [] 111 | in_channels = 3 112 | for x in cfg: 113 | if x == 'M': 114 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 115 | else: 116 | layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 117 | nn.ReLU(True)] 118 | in_channels = x 119 | return nn.Sequential(*layers) 120 | 121 | if __name__ == '__main__': 122 | t = torch.randn(1, 3, 300, 300) 123 | net = MDSSD300() 124 | # print(net) 125 | res = net.forward(t) 126 | -------------------------------------------------------------------------------- 
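For reference, here is a minimal smoke test of the MDSSD300 model listed above (a sketch, not part of the repository): it pushes one random 300x300 image through the network and inspects the two multibox outputs. The exact number of default boxes depends on the feature-map sizes consumed by the multibox head listed next; only the last dimensions (4 box offsets, 13 class scores) are fixed by the code.

```python
# Sketch: forward one dummy image through MDSSD300 (assumes the scripts
# directory is on the import path). Output shapes follow multibox_layer.py below.
import torch
from mdssd import MDSSD300

net = MDSSD300()
net.eval()
with torch.no_grad():
    x = torch.randn(1, 3, 300, 300)        # dummy RGB batch of one image
    loc_preds, conf_preds = net(x)

print(loc_preds.shape)    # [1, num_default_boxes, 4]  -> box regression offsets
print(conf_preds.shape)   # [1, num_default_boxes, 13] -> 12 VisDrone categories + background
```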
/MDSSD_augment/scripts/multibox_layer.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLayer(nn.Module): 12 | num_classes = 13 13 | num_anchors = [6,6,4,4,4,4,4] 14 | 15 | def __init__(self): 16 | super(MultiBoxLayer, self).__init__() 17 | self.in_planes = [512,256,256,256,256,512,1024] 18 | self.loc_layers = nn.ModuleList() 19 | self.conf_layers = nn.ModuleList() 20 | for i in range(len(self.in_planes)): 21 | self.loc_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*4, kernel_size=3, padding=1)) 22 | self.conf_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*13, kernel_size=3, padding=1)) 23 | 24 | def forward(self, xs): 25 | ''' 26 | Args: 27 | xs: (list) of tensor containing intermediate layer outputs. 28 | 29 | Returns: 30 | loc_preds: (tensor) predicted locations, sized [N,8732,4]. 31 | conf_preds: (tensor) predicted class confidences, sized [N,8732,21]. 32 | ''' 33 | y_locs = [] 34 | y_confs = [] 35 | for i, x in enumerate(xs): 36 | y_loc = self.loc_layers[i](x) 37 | N = y_loc.size(0) 38 | y_loc = y_loc.permute(0,2,3,1).contiguous() 39 | y_loc = y_loc.view(N,-1,4) 40 | y_locs.append(y_loc) 41 | 42 | y_conf = self.conf_layers[i](x) 43 | y_conf = y_conf.permute(0,2,3,1).contiguous() 44 | y_conf = y_conf.view(N,-1,13) 45 | y_confs.append(y_conf) 46 | 47 | loc_preds = torch.cat(y_locs, 1) 48 | conf_preds = torch.cat(y_confs, 1) 49 | return loc_preds, conf_preds 50 | -------------------------------------------------------------------------------- /MDSSD_augment/scripts/multibox_loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLoss(nn.Module): 12 | num_classes = 13 13 | 14 | def __init__(self): 15 | super(MultiBoxLoss, self).__init__() 16 | 17 | def cross_entropy_loss(self, x, y): 18 | '''Cross entropy loss w/o averaging across all samples. 19 | 20 | Args: 21 | x: (tensor) sized [N,D]. 22 | y: (tensor) sized [N,]. 23 | 24 | Return: 25 | (tensor) cross entroy loss, sized [N,]. 26 | ''' 27 | xmax = x.data.max() 28 | print('x y size {} {}'.format(x.size(), y.size())) 29 | log_sum_exp = torch.log(torch.sum(torch.exp(x-xmax), 1)) + xmax 30 | print('log_sum_exp {}'.format(log_sum_exp.size())) 31 | return log_sum_exp - x.gather(1, y.view(-1,1)) 32 | 33 | def hard_negative_mining(self, conf_loss, pos): 34 | '''Return negative indices that is 3x the number as postive indices. 35 | 36 | Args: 37 | conf_loss: (tensor) cross entroy loss between conf_preds and conf_targets, sized [N*8732,]. 38 | pos: (tensor) positive(matched) box indices, sized [N,8732]. 39 | 40 | Return: 41 | (tensor) negative indices, sized [N,8732]. 
42 | ''' 43 | batch_size, num_boxes = pos.size() 44 | conf_loss[pos.view(-1)] = 0 # set pos boxes = 0, the rest are neg conf_loss 45 | conf_loss = conf_loss.view(batch_size, -1) # [N,8732] 46 | 47 | _,idx = conf_loss.sort(1, descending=True) # sort by neg conf_loss 48 | _,rank = idx.sort(1) # [N,8732] 49 | 50 | num_pos = pos.long().sum(1) # [N,1] 51 | num_neg = torch.clamp(3*num_pos, max=num_boxes-1) # [N,1] 52 | 53 | neg = rank < num_neg.unsqueeze(1).expand_as(rank) 54 | 55 | return neg 56 | 57 | def forward(self, loc_preds, loc_targets, conf_preds, conf_targets): 58 | '''Compute loss between (loc_preds, loc_targets) and (conf_preds, conf_targets). 59 | 60 | Args: 61 | loc_preds: (tensor) predicted locations, sized [batch_size, 8732, 4]. 62 | loc_targets: (tensor) encoded target locations, sized [batch_size, 8732, 4]. 63 | conf_preds: (tensor) predicted class confidences, sized [batch_size, 8732, num_classes]. 64 | conf_targets: (tensor) encoded target classes, sized [batch_size, 8732]. 65 | 66 | loss: 67 | (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + CrossEntropyLoss(conf_preds, conf_targets). 68 | ''' 69 | batch_size, num_boxes, _ = loc_preds.size() 70 | pos = conf_targets > 0 # [N,8732], pos means the box matched. 71 | num_matched_boxes = pos.data.float().sum() 72 | if num_matched_boxes == 0: 73 | return torch.tensor([0.], requires_grad=True) 74 | 75 | ################################################################ 76 | # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets) 77 | ################################################################ 78 | pos_mask = pos.unsqueeze(2).expand_as(loc_preds) # [N,8732,4] 79 | pos_loc_preds = loc_preds[pos_mask].view(-1,4) # [#pos,4] 80 | pos_loc_targets = loc_targets[pos_mask].view(-1,4) # [#pos,4] 81 | 82 | loc_loss = F.smooth_l1_loss(pos_loc_preds, pos_loc_targets, size_average=False) 83 | 84 | ################################################################ 85 | # conf_loss = CrossEntropyLoss(pos_conf_preds, pos_conf_targets) 86 | # + CrossEntropyLoss(neg_conf_preds, neg_conf_targets) 87 | ################################################################ 88 | conf_preds = conf_preds.contiguous() 89 | conf_loss = F.cross_entropy(conf_preds.view(-1,self.num_classes), \ 90 | conf_targets.view(-1), reduce=False) # [N*8732,] 91 | neg = self.hard_negative_mining(conf_loss, pos) # [N,8732] 92 | 93 | pos_mask = pos.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 94 | neg_mask = neg.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 95 | mask = (pos_mask+neg_mask).gt(0) 96 | 97 | pos_and_neg = (pos+neg).gt(0) 98 | preds = conf_preds[mask].view(-1,self.num_classes) # [#pos+#neg,21] 99 | targets = conf_targets[pos_and_neg] # [#pos+#neg,] 100 | conf_loss = F.cross_entropy(preds, targets, size_average=False) 101 | 102 | loc_loss /= num_matched_boxes 103 | conf_loss /= num_matched_boxes 104 | 105 | return loc_loss + conf_loss 106 | -------------------------------------------------------------------------------- /MDSSD_augment/scripts/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | class L2Norm(nn.Module): 7 | '''L2Norm layer across all channels and scale.''' 8 | def __init__(self, in_features,scale): 9 | super(L2Norm, self).__init__() 10 | self.weight = nn.Parameter(torch.Tensor(in_features)) 11 | self.reset_parameters(scale) 12 | 13 | def reset_parameters(self, scale): 14 | 
nn.init.constant_(self.weight, scale) 15 | 16 | def forward(self, x): 17 | x = F.normalize(x, dim=1) 18 | scale = self.weight[None,:,None,None] 19 | return scale * x -------------------------------------------------------------------------------- /MDSSD_augment/scripts/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.nn.functional as F 4 | import torchvision.transforms as transforms 5 | 6 | import sys 7 | from mdssd import MDSSD300 8 | from encoder import DataEncoder 9 | import cv2 10 | 11 | VOC_LABELS = ( 12 | 'ignored regions', 13 | 'pedestrian', 14 | 'people', 15 | 'bicycle', 16 | 'car', 17 | 'van', 18 | 'truck', 19 | 'tricycle', 20 | 'awning-tricycle', 21 | 'bus', 22 | 'motor', 23 | 'other' 24 | ) 25 | 26 | 27 | # Load model 28 | net = MDSSD300() 29 | checkpoint = torch.load('./checkpoint/ckpt.pth') 30 | 31 | keys = [] 32 | for k,v in checkpoint['net'].items(): 33 | if "module" in k: 34 | keys.append(k) 35 | for i in keys: 36 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 37 | del checkpoint['net'][i] 38 | 39 | net.load_state_dict(checkpoint['net']) 40 | net.eval() 41 | 42 | if len(sys.argv) == 2: 43 | img_path = sys.argv[1] 44 | else: 45 | img_path = './images/img5.jpg' 46 | # Load test image 47 | img = cv2.imread(img_path) 48 | 49 | img1 = cv2.resize(img, (300, 300)) 50 | transform = transforms.Compose([transforms.ToTensor(), 51 | transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]) 52 | img1 = transform(img1) 53 | 54 | # Forward 55 | with torch.no_grad(): 56 | x = torch.tensor(img1) 57 | loc_preds, conf = net(x.unsqueeze(0)) 58 | # Decode 59 | data_encoder = DataEncoder() 60 | boxes, labels, scores = data_encoder.decode(loc_preds.data.squeeze(0), F.softmax(conf.squeeze(0), dim=1).data) 61 | for box, label, score in zip(boxes, labels, scores): 62 | for b, s in zip(box, score): 63 | if s > 0.25:#0.7: 64 | b[::2] *= img.shape[1] 65 | b[1::2] *= img.shape[0] 66 | print('label:',VOC_LABELS[int(label[0])], 'score:', score) 67 | b = list(b) 68 | cv2.rectangle(img, (b[0], b[1]), (b[2], b[3]), (255, 255, 255), 2) 69 | title = '{}: {}'.format(VOC_LABELS[int(label[0])], round(float(score[0]), 2)) 70 | cv2.putText(img, title, (b[0], b[1]), cv2.FONT_ITALIC, 0.6, (0, 255, 0), 2) 71 | cv2.imshow('img', img) 72 | cv2.waitKey(0) -------------------------------------------------------------------------------- /MDSSD_augment/scripts/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import argparse 5 | import itertools 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | import torch.backends.cudnn as cudnn 12 | 13 | import torchvision 14 | import torchvision.transforms as transforms 15 | 16 | import numpy as np 17 | from mdssd import MDSSD300 18 | from datagen import ListDataset 19 | from multibox_loss import MultiBoxLoss 20 | 21 | TRAIN_IMG_DIR = '../../../../VisDrone2019/dataset/VisDrone2018-DET-train/images/' 22 | TRAIN_ANNOT_DIR = '../../../../VisDrone2019/dataset/VisDrone2018-DET-train/annotations/' 23 | VAL_IMAGE_DIR = '../../../../VisDrone2019/dataset/VisDrone2019-DET-val/images/' 24 | VAL_ANNOT_DIR = '../../../../VisDrone2019/dataset/VisDrone2019-DET-val/annotations/' 25 | 26 | lr = 0.001 27 | resume = False # Resume from checkpoint 28 | epoch = 200 29 | batch_size = 8 30 | 31 | use_cuda = 
torch.cuda.is_available() 32 | best_loss = float('inf') # best test loss 33 | start_epoch = 0 # start from epoch 0 or last epoch 34 | 35 | # Data 36 | print('==> Preparing data..') 37 | transform = transforms.Compose([transforms.ToTensor(), 38 | transforms.Normalize(mean=(0.356, 0.368, 0.362), std=(0.242, 0.235, 0.236))]) 39 | 40 | trainset = ListDataset(root=TRAIN_IMG_DIR, list_file=TRAIN_ANNOT_DIR, train=True, transform=transform) 41 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True, num_workers=4) 42 | 43 | valset = ListDataset(root=VAL_IMAGE_DIR, list_file=VAL_ANNOT_DIR, train=True, transform=transform) 44 | valloader = torch.utils.data.DataLoader(valset, batch_size=8, shuffle=True, num_workers=4) 45 | 46 | # Model 47 | net = MDSSD300() 48 | if resume: 49 | print('==> Resuming from checkpoint..') 50 | checkpoint = torch.load('./checkpoint/ckpt.pth') 51 | 52 | keys = [] 53 | for k,v in checkpoint['net'].items(): 54 | if "module" in k: 55 | keys.append(k) 56 | for i in keys: 57 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 58 | del checkpoint['net'][i] 59 | 60 | net.load_state_dict(checkpoint['net']) 61 | best_loss = checkpoint['loss'] 62 | start_epoch = checkpoint['epoch'] 63 | else: 64 | # Convert from pretrained VGG model. 65 | try: 66 | net.load_state_dict(torch.load('../model/ssd.pth')) 67 | print('==> Pretrain model read successfully') 68 | except: 69 | print('==> Pretrain model read failed or not existed, training from init') 70 | 71 | criterion = MultiBoxLoss() 72 | 73 | if use_cuda: 74 | net = torch.nn.DataParallel(net, device_ids=[0]) 75 | net.cuda() 76 | cudnn.benchmark = True 77 | 78 | optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4) 79 | # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.25) 80 | 81 | # Training 82 | def train(epoch,prev_val_loss, last_saved): 83 | print('\nEpoch: %d' % epoch) 84 | net.train() 85 | train_loss = 0 86 | for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader): 87 | if use_cuda: 88 | images = images.cuda() 89 | loc_targets = loc_targets.cuda() 90 | conf_targets = conf_targets.cuda() 91 | 92 | images = torch.tensor(images) 93 | loc_targets = torch.tensor(loc_targets) 94 | conf_targets = torch.tensor(conf_targets) 95 | 96 | optimizer.zero_grad() 97 | loc_preds, conf_preds = net(images) 98 | loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 99 | loss.backward() 100 | optimizer.step() 101 | # scheduler.step() 102 | train_loss += loss.item() 103 | 104 | if batch_idx%100 == 0: 105 | val_loss_tot = 0 106 | for batch_idx_val, (images, loc_targets, conf_targets) in enumerate(valloader): 107 | if use_cuda: 108 | images = images.cuda() 109 | loc_targets = loc_targets.cuda() 110 | conf_targets = conf_targets.cuda() 111 | 112 | images = torch.tensor(images) 113 | loc_targets = torch.tensor(loc_targets) 114 | conf_targets = torch.tensor(conf_targets) 115 | 116 | loc_preds, conf_preds = net(images) 117 | val_loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 118 | val_loss_tot += val_loss.item() 119 | 120 | val_loss_tot /= (batch_idx_val+1) 121 | if val_loss_tot < prev_val_loss: 122 | os.makedirs('checkpoint', exist_ok=True) 123 | torch.save({ 124 | 'epoch': epoch, 125 | 'net': net.state_dict(), 126 | 'loss': loss, 127 | }, 'checkpoint/ckpt.pth') 128 | print("Saved.") 129 | prev_val_loss = val_loss_tot 130 | last_saved = [epoch, batch_idx] 131 | # print('epoch: {}, batch_idx: {},loss: {}, train_loss: {}'.format(epoch, 
batch_idx, loss.item(), train_loss/(batch_idx+1))) 132 | print('epoch: {}, batch_idx: {},loss: {}, train_loss: {}, best_val_loss: {}, last_saved: {}'.format(epoch, batch_idx, loss.item(), train_loss/(batch_idx+1), prev_val_loss, last_saved)) 133 | 134 | return prev_val_loss, last_saved 135 | 136 | 137 | prev_val_loss = 999 138 | last_saved = [start_epoch,0] 139 | for epoch_num in range(start_epoch, start_epoch+epoch): 140 | prev_val_loss, last_saved = train(epoch_num, prev_val_loss, last_saved) 141 | -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | class AttentionBlock(nn.Module): 7 | def __init__(self, channels): 8 | super(AttentionBlock, self).__init__() 9 | 10 | self.f = nn.Conv2d(channels, channels, kernel_size=3, padding=1, dilation=1) 11 | self.g = nn.Conv2d(channels, channels, kernel_size=3, padding=1, dilation=1) 12 | self.h = nn.Conv2d(channels, channels, kernel_size=3, padding=1, dilation=1) 13 | 14 | def forward(self, x): 15 | f = self.f(x) 16 | f = torch.transpose(f,-2,-1) 17 | g = self.g(x) 18 | h = self.h(x) 19 | 20 | attention_map = torch.mul(f,g) 21 | out = torch.mul(h, attention_map) 22 | return out 23 | 24 | 25 | if __name__ == "__main__": 26 | x = torch.rand(1,3,300,300) 27 | att = AttentionBlock(3) 28 | x = att(x) 29 | print(x.size()) 30 | -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/datagen.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load image/class/box from a annotation file. 3 | 4 | ''' 5 | from __future__ import print_function 6 | 7 | import os 8 | import sys 9 | import os.path 10 | 11 | import random 12 | import numpy as np 13 | 14 | import torch 15 | import torch.utils.data as data 16 | import torchvision.transforms as transforms 17 | 18 | from encoder import DataEncoder 19 | import cv2 20 | 21 | import pandas as pd 22 | import shutil 23 | import os 24 | import numpy as np 25 | import glob 26 | import xml.etree.ElementTree as ET 27 | from xml.dom import minidom 28 | 29 | class ListDataset(data.Dataset): 30 | img_size = 300 31 | 32 | def __init__(self, root, list_file, train, transform): 33 | ''' 34 | Args: 35 | root: (str) ditectory to images. 36 | list_file: (str) path to annotation files. 37 | train: (boolean) train or test. 38 | transform: ([transforms]) image transforms. 39 | ''' 40 | self.root = root 41 | self.list_file = list_file 42 | self.train = train 43 | self.transform = transform 44 | 45 | self.fnames = [] 46 | self.boxes = [] 47 | self.labels = [] 48 | 49 | self.data_encoder = DataEncoder() 50 | self.num_samples = 0 51 | 52 | # VisDrone 53 | 54 | for i in os.listdir(list_file): 55 | self.num_samples += 1 56 | self.fnames.append(i) 57 | 58 | def __getitem__(self, idx): 59 | '''Load a image, and encode its bbox locations and class labels. 60 | Args: 61 | idx: (int) image index. 62 | Returns: 63 | img: (tensor) image tensor. 64 | loc_target: (tensor) location targets, sized [8732,4]. 65 | conf_target: (tensor) label targets, sized [8732,]. 66 | ''' 67 | # Load image and bbox locations. 
68 | fname = self.fnames[idx] 69 | img = cv2.imread(os.path.join(self.root, fname[:-4]+".jpg")) 70 | 71 | box = [] 72 | label = [] 73 | with open(os.path.join(self.list_file,fname)) as f: 74 | f = f.read().split("\n") 75 | f = f[:-1] 76 | num_objs = len(f) 77 | 78 | for j in range(num_objs): 79 | f[j] = f[j].split(",") 80 | xmin = float(f[j][0]) 81 | ymin = float(f[j][1]) 82 | w = float(f[j][2]) 83 | h = float(f[j][3]) 84 | 85 | box.append([xmin,ymin,xmin+w,ymin+h]) 86 | label.append(int(f[j][5])) 87 | 88 | # **************************** AUGMENTATION ************************************ 89 | # Copy and paste small objects at random locations in 90 | # image to increase the number of samples with small sizes. 91 | box_new = box.copy() 92 | label_new = label.copy() 93 | img_new = img.copy() 94 | for n in range(len(box)): 95 | j = box[n] # box is stored as (xmin, ymin, xmax, ymax) 96 | if (j[2]-j[0])*(j[3]-j[1]) < 500: # small object: area below 500 px 97 | crop = img[int(j[1]):int(j[3]),int(j[0]):int(j[2])] 98 | x = random.randrange(0, img.shape[1],1) 99 | y = random.randrange(0, img.shape[0],1) 100 | 101 | try: 102 | img_new[int(y):int(y+j[3]-j[1]),int(x):int(x+j[2]-j[0])] = crop 103 | box_new.append([x,y,x+j[2]-j[0],y+j[3]-j[1]]) 104 | label_new.append(label[n]) 105 | except: 106 | continue # paste region clipped at the image border; skip this copy 107 | 108 | # ******************************************************************************** 109 | 110 | self.boxes.append(torch.Tensor(box_new)) 111 | self.labels.append(torch.LongTensor(label_new)) 112 | img = img_new 113 | 114 | boxes = self.boxes[-1].clone() 115 | labels = self.labels[-1] 116 | 117 | # Data augmentation while training. 118 | if self.train: 119 | img, boxes = self.random_flip(img, boxes) 120 | img, boxes, labels = self.random_crop(img, boxes, labels) 121 | 122 | # Scale bbox locations to [0,1]. 123 | w,h = img.shape[1], img.shape[0] 124 | boxes /= torch.Tensor([w,h,w,h]).expand_as(boxes) 125 | img = cv2.resize(img, (self.img_size,self.img_size)) 126 | img = self.transform(img) 127 | 128 | # Encode loc & conf targets. 129 | 130 | loc_target, conf_target = self.data_encoder.encode(boxes, labels) 131 | return img, loc_target, conf_target 132 | 133 | def random_flip(self, img, boxes): 134 | '''Randomly flip the image and adjust the bbox locations. 135 | For bbox (xmin, ymin, xmax, ymax), the flipped bbox is: 136 | (w-xmax, ymin, w-xmin, ymax). 137 | Args: 138 | img: (ndarray.Image) image. 139 | boxes: (tensor) bbox locations, sized [#obj, 4]. 140 | Returns: 141 | img: (ndarray.Image) randomly flipped image. 142 | boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4]. 143 | ''' 144 | if random.random() < 0.5: 145 | img = cv2.flip(img, 1) 146 | w = img.shape[1] 147 | xmin = w - boxes[:,2] 148 | xmax = w - boxes[:,0] 149 | boxes[:,0] = xmin 150 | boxes[:,2] = xmax 151 | return img, boxes 152 | 153 | def random_crop(self, img, boxes, labels): 154 | '''Randomly crop the image and adjust the bbox locations. 155 | For more details, see 'Chapter 2.2: Data augmentation' of the paper. 156 | Args: 157 | img: (ndarray.Image) image. 158 | boxes: (tensor) bbox locations, sized [#obj, 4]. 159 | labels: (tensor) bbox labels, sized [#obj,]. 160 | Returns: 161 | img: (ndarray.Image) cropped image. 162 | selected_boxes: (tensor) selected bbox locations. 163 | labels: (tensor) selected bbox labels.
164 | ''' 165 | imw, imh = img.shape[1], img.shape[0] 166 | while True: 167 | min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])# random choice the one 168 | if min_iou is None: 169 | return img, boxes, labels 170 | 171 | for _ in range(100): 172 | w = random.randrange(int(0.1*imw), imw) 173 | h = random.randrange(int(0.1*imh), imh) 174 | 175 | if h > 2*w or w > 2*h or h < 1 or w < 1: 176 | continue 177 | 178 | x = random.randrange(imw - w) 179 | y = random.randrange(imh - h) 180 | roi = torch.Tensor([[x, y, x+w, y+h]]) 181 | 182 | center = (boxes[:,:2] + boxes[:,2:]) / 2 # [N,2] 183 | roi2 = roi.expand(len(center), 4) # [N,4] 184 | 185 | mask = (center > roi2[:,:2]) & (center < roi2[:,2:]) # [N,2] 186 | mask = mask[:,0] & mask[:,1] #[N,] 187 | 188 | if not mask.any(): 189 | continue 190 | 191 | selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1)) 192 | 193 | iou = self.data_encoder.iou(selected_boxes, roi) 194 | if iou.min() < min_iou: 195 | continue 196 | img = img[y:y+h, x:x+w, :] 197 | 198 | selected_boxes[:,0].add_(-x).clamp_(min=0, max=w) 199 | selected_boxes[:,1].add_(-y).clamp_(min=0, max=h) 200 | selected_boxes[:,2].add_(-x).clamp_(min=0, max=w) 201 | selected_boxes[:,3].add_(-y).clamp_(min=0, max=h) 202 | 203 | return img, selected_boxes, labels[mask] 204 | 205 | def __len__(self): 206 | return self.num_samples -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/encoder.py: -------------------------------------------------------------------------------- 1 | '''Encode target locations and labels.''' 2 | import torch 3 | 4 | import math 5 | import itertools 6 | 7 | class DataEncoder: 8 | def __init__(self): 9 | '''Compute default box sizes with scale and aspect transform.''' 10 | scale = 300. 11 | steps = [s / scale for s in (32, 64, 100, 300, 4, 8, 16)] 12 | sizes_ssd = [s / scale for s in (111, 162, 213, 264, 315)] 13 | sizes_fusion = [s / scale for s in (15, 30, 60, 111)] 14 | aspect_ratios = ((2,3), (2,3), (2,), (2,), (2,), (2,), (2,)) 15 | feature_map_sizes = (10, 5, 3, 1, 75, 38, 19) 16 | num_layers = len(feature_map_sizes) 17 | 18 | boxes = [] 19 | for i in range(num_layers): 20 | fmsize = feature_map_sizes[i] # feature map size 21 | for h,w in itertools.product(range(fmsize), repeat=2): 22 | cx = (w + 0.5)*steps[i] 23 | cy = (h + 0.5)*steps[i] 24 | 25 | if i<4: 26 | s = sizes_ssd[i] 27 | boxes.append((cx, cy, s, s)) 28 | 29 | s = math.sqrt(sizes_ssd[i] * sizes_ssd[i+1]) 30 | boxes.append((cx, cy, s, s)) 31 | 32 | s = sizes_ssd[i] 33 | for ar in aspect_ratios[i]: 34 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 35 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 36 | 37 | else: 38 | s = sizes_fusion[i-4] 39 | boxes.append((cx, cy, s, s)) 40 | 41 | s = math.sqrt(sizes_fusion[i-4] * sizes_fusion[i-4+1]) 42 | boxes.append((cx, cy, s, s)) 43 | 44 | s = sizes_fusion[i-4] 45 | for ar in aspect_ratios[i]: 46 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 47 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 48 | 49 | self.default_boxes = torch.Tensor(boxes) 50 | 51 | def iou(self, box1, box2): 52 | '''Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2]. 53 | 54 | Args: 55 | box1: (tensor) bounding boxes, sized [N,4]. 56 | box2: (tensor) bounding boxes, sized [M,4]. 57 | 58 | Return: 59 | (tensor) iou, sized [N,M]. 
60 | ''' 61 | N = box1.size(0) 62 | M = box2.size(0) 63 | 64 | lt = torch.max( 65 | box1[:,:2].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 66 | box2[:,:2].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 67 | ) 68 | 69 | rb = torch.min( 70 | box1[:,2:].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 71 | box2[:,2:].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 72 | ) 73 | 74 | wh = rb - lt # [N,M,2] 75 | wh[wh<0] = 0 # clip at 0 76 | inter = wh[:,:,0] * wh[:,:,1] # [N,M] 77 | 78 | area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1]) # [N,] 79 | area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1]) # [M,] 80 | area1 = area1.unsqueeze(1).expand_as(inter) # [N,] -> [N,1] -> [N,M] 81 | area2 = area2.unsqueeze(0).expand_as(inter) # [M,] -> [1,M] -> [N,M] 82 | 83 | iou = inter / (area1 + area2 - inter) 84 | return iou 85 | 86 | def encode(self, boxes, classes, threshold=0.5): 87 | '''Transform target bounding boxes and class labels to SSD boxes and classes. 88 | 89 | Match each object box to all the default boxes, pick the ones with the 90 | Jaccard-Index > 0.5: 91 | Jaccard(A,B) = AB / (A+B-AB) 92 | 93 | Args: 94 | boxes: (tensor) object bounding boxes (xmin,ymin,xmax,ymax) of a image, sized [#obj, 4]. 95 | classes: (tensor) object class labels of a image, sized [#obj,]. 96 | threshold: (float) Jaccard index threshold 97 | 98 | Returns: 99 | boxes: (tensor) bounding boxes, sized [#obj, 8732, 4]. 100 | classes: (tensor) class labels, sized [8732,] 101 | ''' 102 | default_boxes = self.default_boxes 103 | num_default_boxes = default_boxes.size(0) 104 | num_objs = boxes.size(0) 105 | 106 | iou = self.iou( # [#obj,8732] 107 | boxes, 108 | torch.cat([default_boxes[:,:2] - default_boxes[:,2:]/2, 109 | default_boxes[:,:2] + default_boxes[:,2:]/2], 1) 110 | ) 111 | 112 | iou, max_idx = iou.max(0) # [1,8732] 113 | max_idx.squeeze_(0) # [8732,] 114 | iou.squeeze_(0) # [8732,] 115 | 116 | boxes = boxes[max_idx] # [8732,4] 117 | variances = [0.1, 0.2] 118 | cxcy = (boxes[:,:2] + boxes[:,2:])/2 - default_boxes[:,:2] # [8732,2] 119 | cxcy /= variances[0] * default_boxes[:,2:] 120 | wh = (boxes[:,2:] - boxes[:,:2]) / default_boxes[:,2:] # [8732,2] 121 | wh = torch.log(wh) / variances[1] 122 | loc = torch.cat([cxcy, wh], 1) # [8732,4] 123 | 124 | conf = 1 + classes[max_idx] # [8732,], background class = 0 125 | conf[iou 0: 153 | try: 154 | i = order[0] 155 | except: 156 | i = order 157 | keep.append(i) 158 | 159 | if order.size == 1: 160 | break 161 | 162 | xx1 = x1[order[1:]].clamp(min=x1[i]) 163 | yy1 = y1[order[1:]].clamp(min=y1[i]) 164 | xx2 = x2[order[1:]].clamp(max=x2[i]) 165 | yy2 = y2[order[1:]].clamp(max=y2[i]) 166 | 167 | w = (xx2-xx1).clamp(min=0) 168 | h = (yy2-yy1).clamp(min=0) 169 | inter = w*h 170 | 171 | if mode == 'union': 172 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 173 | elif mode == 'min': 174 | ovr = inter / areas[order[1:]].clamp(max=areas[i]) 175 | else: 176 | raise TypeError('Unknown nms mode: %s.' % mode) 177 | 178 | ids = (ovr<=threshold).nonzero().squeeze() 179 | if ids.size == 0: 180 | break 181 | order = order[ids+1] 182 | return torch.LongTensor(keep) 183 | 184 | def decode(self, loc, conf): 185 | '''Transform predicted loc/conf back to real bbox locations and class labels. 186 | 187 | Args: 188 | loc: (tensor) predicted loc, sized [8732,4]. 189 | conf: (tensor) predicted conf, sized [8732,21]. 190 | 191 | Returns: 192 | boxes: (tensor) bbox locations, sized [#obj, 4]. 193 | labels: (tensor) class labels, sized [#obj,1]. 
194 | ''' 195 | variances = (0.1, 0.2) 196 | wh = torch.exp(loc[:,2:]*variances[1]) * self.default_boxes[:,2:] 197 | cxcy = loc[:,:2] * variances[0] * self.default_boxes[:,2:] + self.default_boxes[:,:2] 198 | box_preds = torch.cat([cxcy-wh/2, cxcy+wh/2], 1) # [8732,4] 199 | 200 | boxes = [] 201 | labels = [] 202 | scores = [] 203 | num_classes = conf.size(1) 204 | for i in range(num_classes-1): 205 | score = conf[:,i+1] # class i corresponds to (i+1) column 206 | mask = score > 0.1 207 | 208 | if not mask.any(): 209 | continue 210 | 211 | box = box_preds[mask.nonzero().squeeze()] 212 | score = score[mask] 213 | 214 | if len(score) == 1: 215 | continue 216 | keep = self.nms(box, score, threshold=0.3) 217 | boxes.append(box[keep]) 218 | labels.append(torch.LongTensor(len(box[keep])).fill_(i)) 219 | scores.append(score[keep]) 220 | 221 | return boxes, labels, scores 222 | -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/fusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | from torch.autograd import Variable 7 | 8 | from norm import L2Norm 9 | 10 | class FusionBlock(nn.Module): 11 | def __init__(self, big_features, small_features): 12 | super(FusionBlock, self).__init__() 13 | 14 | # Bigger feature map 15 | self.conv1_1 = nn.Conv2d(big_features, 256, kernel_size=3, padding=1, dilation=1) 16 | self.Norm1 = L2Norm(256, 20) 17 | 18 | # Smaller feature map 19 | self.deconv2_1 = nn.ConvTranspose2d(small_features, 256, 2, stride=2, dilation=1) 20 | self.conv2_1 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 21 | self.bn2_1 = nn.BatchNorm2d(256) 22 | self.deconv2_2 = nn.ConvTranspose2d(256, 256, 2, stride=2, dilation=1) 23 | self.conv2_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 24 | self.bn2_2 = nn.BatchNorm2d(256) 25 | self.deconv2_3 = nn.ConvTranspose2d(256, 256, 3, stride=2, dilation=1) 26 | self.conv2_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 27 | self.Norm2 = L2Norm(256, 20) 28 | 29 | # Common 30 | self.conv3_1 = nn.Conv2d(256, big_features, kernel_size=3, padding=1, dilation=1) 31 | 32 | 33 | def forward(self, big, small): 34 | h1 = self.conv1_1(big) 35 | h1 = self.Norm1(h1) 36 | 37 | h2 = self.deconv2_1(small) 38 | # print(h2.size()) 39 | h2 = F.relu(self.bn2_1(self.conv2_1(h2))) 40 | # print(h2.size()) 41 | h2 = self.deconv2_2(h2) 42 | # print(h2.size()) 43 | h2 = F.relu(self.bn2_2(self.conv2_2(h2))) 44 | # print(h2.size()) 45 | h2 = self.deconv2_3(h2) 46 | # print(h2.size()) 47 | h2 = self.conv2_3(h2) 48 | # print(h2.size()) 49 | h2 = self.Norm2(h2) 50 | 51 | size = h2.size()[3] 52 | diff_odd = h2.size()[-1] - h1.size()[-1] 53 | h2 = h2[:,:,(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2)),(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2))] 54 | 55 | # print(h1.size(), h2.size()) 56 | h = F.relu(h1+h2) 57 | h = F.relu(self.conv3_1(h)) 58 | 59 | return h 60 | 61 | if __name__ == '__main__': 62 | big = torch.randn(1, 256, 128, 128) 63 | small = torch.rand(1,512,16,16) 64 | net = FusionBlock(256,512) 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/gen_test_file.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.nn.functional as F 4 | import 
torchvision.transforms as transforms 5 | 6 | import sys 7 | from mdssd import MDSSD300 8 | from encoder import DataEncoder 9 | import cv2 10 | 11 | import pandas as pd 12 | import shutil 13 | import os 14 | import numpy as np 15 | import glob 16 | import xml.etree.ElementTree as ET 17 | from xml.dom import minidom 18 | 19 | TEST_DIR = '/home/siddhant/deeplearning/Dataset/VisDrone2019/VisDrone2019-DET-val/images/' 20 | TEST_ANNOT = '/home/siddhant/deeplearning/Dataset/VisDrone2019/VisDrone2019-DET-val/annotations/' 21 | 22 | LABELS = ( 23 | 'ignored regions', 24 | 'pedestrian', 25 | 'people', 26 | 'bicycle', 27 | 'car', 28 | 'van', 29 | 'truck', 30 | 'tricycle', 31 | 'awning-tricycle', 32 | 'bus', 33 | 'motor', 34 | 'other' 35 | ) 36 | 37 | def GT(annotation_file): 38 | # Load model 39 | net = MDSSD300() 40 | checkpoint = torch.load('./checkpoint/ckpt.pth') 41 | 42 | keys = [] 43 | for k,v in checkpoint['net'].items(): 44 | if "module" in k: 45 | keys.append(k) 46 | for i in keys: 47 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 48 | del checkpoint['net'][i] 49 | 50 | net.load_state_dict(checkpoint['net']) 51 | net.eval() 52 | 53 | count = 0 54 | for i in os.listdir(annotation_file): 55 | count += 1 56 | print(count) 57 | with open(os.path.join(annotation_file,i)) as f: 58 | f = f.read().split("\n") 59 | f = f[:-1] 60 | num_objs = len(f) 61 | 62 | file = open(os.path.join("../test/gt/",i[:-4]+".txt"), "w") 63 | 64 | for j in range(num_objs): 65 | f[j] = f[j].split(",") 66 | label = int(f[j][5]) 67 | if label == 0: 68 | continue 69 | xmin = float(f[j][0]) 70 | ymin = float(f[j][1]) 71 | w = float(f[j][2]) 72 | h = float(f[j][3]) 73 | file.write(str(LABELS[label])+" "+str(int(xmin))+" "+str(int(ymin))+" "+str(int(xmin+w))+" "+str(int(ymin+h))+"\n") 74 | file.close() 75 | 76 | def detect(image_dir): 77 | # Load model 78 | net = MDSSD300() 79 | checkpoint = torch.load('./checkpoint/ckpt.pth') 80 | 81 | keys = [] 82 | for k,v in checkpoint['net'].items(): 83 | if "module" in k: 84 | keys.append(k) 85 | for i in keys: 86 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 87 | del checkpoint['net'][i] 88 | 89 | net.load_state_dict(checkpoint['net']) 90 | net.eval() 91 | 92 | count = 0 93 | for i in os.listdir(image_dir): 94 | count += 1 95 | print(count) 96 | file = open("../test/detect/"+i[:-4]+".txt","w") 97 | img = cv2.imread(os.path.join(image_dir,i)) 98 | 99 | img1 = cv2.resize(img, (300, 300)) 100 | transform = transforms.Compose([transforms.ToTensor(), 101 | transforms.Normalize(mean=(0.356, 0.368, 0.362), std=(0.242, 0.235, 0.236))]) 102 | img1 = transform(img1) 103 | 104 | # Forward 105 | with torch.no_grad(): 106 | x = torch.tensor(img1) 107 | loc_preds, conf = net(x.unsqueeze(0)) 108 | # Decode 109 | data_encoder = DataEncoder() 110 | boxes, labels, scores = data_encoder.decode(loc_preds.data.squeeze(0), F.softmax(conf.squeeze(0), dim=1).data) 111 | 112 | for box, label, score in zip(boxes, labels, scores): 113 | for b, l, s in zip(box, label, score): 114 | # print(b,l,s) 115 | if l.item() == 0: 116 | continue 117 | b[::2] *= img.shape[1] 118 | b[1::2] *= img.shape[0] 119 | 120 | xmin = str(int(b[0].item())) 121 | ymin = str(int(b[1].item())) 122 | xmax = str(int(b[2].item())) 123 | ymax = str(int(b[3].item())) 124 | confidence = str(s.item()) 125 | label = str(LABELS[int(l.item())]) 126 | file.write(label+" "+confidence+" "+xmin+" "+ymin+" "+xmax+" "+ymax+"\n") 127 | 128 | file.close() 129 | 130 | 131 | if __name__ == "__main__": 132 | GT(TEST_ANNOT) 133 | detect(TEST_DIR) 134 | 
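The two loops above write plain-text files that standard mAP evaluation scripts can consume: GT() emits one "<label> <xmin> <ymin> <xmax> <ymax>" line per ground-truth object, and detect() emits "<label> <confidence> <xmin> <ymin> <xmax> <ymax>" per prediction. A small parsing sketch follows (the file name is hypothetical); numeric fields are read from the right so a label could in principle contain spaces.

```python
# Sketch: read back one detection file written by detect() above.
# Each line: "<label> <confidence> <xmin> <ymin> <xmax> <ymax>".
def read_detections(path):
    dets = []
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) < 6:
                continue  # skip blank or malformed lines
            conf, xmin, ymin, xmax, ymax = map(float, parts[-5:])
            label = " ".join(parts[:-5])
            dets.append((label, conf, (xmin, ymin, xmax, ymax)))
    return dets

# Hypothetical file name under the output directory used by detect():
print(read_detections("../test/detect/0000001_00000_d_0000001.txt")[:3])
```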
-------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/mdssd.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | 9 | from torch.autograd import Variable 10 | 11 | from multibox_layer import MultiBoxLayer 12 | from fusion import FusionBlock 13 | from norm import L2Norm 14 | from attention import AttentionBlock 15 | 16 | class MDSSD300(nn.Module): 17 | input_size = 300 18 | 19 | def __init__(self): 20 | super(MDSSD300, self).__init__() 21 | 22 | # Attention 23 | self.attn1 = AttentionBlock(3) 24 | 25 | # model 26 | self.base = self.VGG16() 27 | self.norm4 = L2Norm(512, 20) # 38 28 | 29 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 30 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 31 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 32 | self.bn5 = nn.BatchNorm2d(512) 33 | 34 | self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 35 | self.bn6 = nn.BatchNorm2d(1024) 36 | 37 | self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 38 | self.bn7 = nn.BatchNorm2d(1024) 39 | # self.attn2 = AttentionBlock(1024) 40 | 41 | self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1) 42 | self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2) 43 | self.bn8 = nn.BatchNorm2d(512) 44 | 45 | self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1) 46 | self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2) 47 | self.bn9 = nn.BatchNorm2d(256) 48 | 49 | self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1) 50 | self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3) 51 | 52 | self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1) 53 | self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3) 54 | 55 | self.Fusion1 = FusionBlock(256,512) 56 | self.Fusion2 = FusionBlock(512,256) 57 | self.Fusion3 = FusionBlock(1024,256) 58 | 59 | # multibox layer 60 | self.multibox = MultiBoxLayer() 61 | 62 | def forward(self, x): 63 | odd_count = 0 64 | odd = [] 65 | hs = [] 66 | vgg = [] 67 | fusion_layers = [] 68 | 69 | h= self.attn1(x) 70 | 71 | h = self.base[0](h) 72 | vgg.append(h) 73 | for i in range(1,len(self.base)): 74 | h = self.base[i](h) 75 | vgg.append(h) 76 | fusion_layers.append(vgg[15]) 77 | odd.append(2) 78 | odd_count = 3 79 | fusion_layers.append(h) 80 | h = F.max_pool2d(h, kernel_size=2, stride=2, ceil_mode=True) 81 | 82 | h = F.relu(self.conv5_1(h)) 83 | h = F.relu(self.conv5_2(h)) 84 | h = F.relu(self.conv5_3(h)) 85 | h = F.max_pool2d(self.bn5(h), kernel_size=3, padding=1, stride=1, ceil_mode=True) 86 | 87 | h = F.relu(self.bn6(self.conv6(h))) 88 | h = F.relu(self.bn7(self.conv7(h))) 89 | fusion_layers.append(h) 90 | # h = self.attn2(h) 91 | 92 | h = F.relu(self.conv8_1(h)) 93 | h = F.relu(self.bn8(self.conv8_2(h))) 94 | hs.append(h) # conv8_2 95 | 96 | h = F.relu(self.conv9_1(h)) 97 | h = F.relu(self.bn9(self.conv9_2(h))) 98 | hs.append(h) # conv9_2 99 | 100 | h = F.relu(self.conv10_1(h)) 101 | h = F.relu(self.conv10_2(h)) 102 | hs.append(h) # conv10_2 103 | 104 | h = F.relu(self.conv11_1(h)) 105 | h = F.relu(self.conv11_2(h)) 106 | hs.append(h) # conv11_2 107 | 108 | # Fusion Blocks 109 | f = self.Fusion1(fusion_layers[0],hs[-4]) 110 | hs.append(f) 111 | f = self.Fusion2(fusion_layers[1],hs[-4]) 112 | hs.append(f) 113 | diff_odd = fusion_layers[2].size()[-1] - 
hs[-4].size()[-1] 114 | f = self.Fusion3(fusion_layers[2],hs[-4]) 115 | hs.append(f) 116 | 117 | loc_preds, conf_preds = self.multibox(hs) 118 | 119 | return loc_preds, conf_preds 120 | 121 | def VGG16(self): 122 | '''VGG16 layers.''' 123 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512] 124 | layers = [] 125 | in_channels = 3 126 | for x in cfg: 127 | if x == 'M': 128 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 129 | else: 130 | layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 131 | nn.ReLU(True)] 132 | in_channels = x 133 | return nn.Sequential(*layers) 134 | 135 | if __name__ == '__main__': 136 | t = torch.randn(1, 3, 300, 300) 137 | net = MDSSD300() 138 | # print(net) 139 | res = net.forward(t) 140 | -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/multibox_layer.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLayer(nn.Module): 12 | num_classes = 13 13 | num_anchors = [6,6,4,4,4,4,4] 14 | 15 | def __init__(self): 16 | super(MultiBoxLayer, self).__init__() 17 | self.in_planes = [512,256,256,256,256,512,1024] 18 | self.loc_layers = nn.ModuleList() 19 | self.conf_layers = nn.ModuleList() 20 | for i in range(len(self.in_planes)): 21 | self.loc_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*4, kernel_size=3, padding=1)) 22 | self.conf_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*13, kernel_size=3, padding=1)) 23 | 24 | def forward(self, xs): 25 | ''' 26 | Args: 27 | xs: (list) of tensor containing intermediate layer outputs. 28 | 29 | Returns: 30 | loc_preds: (tensor) predicted locations, sized [N,8732,4]. 31 | conf_preds: (tensor) predicted class confidences, sized [N,8732,21]. 32 | ''' 33 | y_locs = [] 34 | y_confs = [] 35 | for i, x in enumerate(xs): 36 | y_loc = self.loc_layers[i](x) 37 | N = y_loc.size(0) 38 | y_loc = y_loc.permute(0,2,3,1).contiguous() 39 | y_loc = y_loc.view(N,-1,4) 40 | y_locs.append(y_loc) 41 | 42 | y_conf = self.conf_layers[i](x) 43 | y_conf = y_conf.permute(0,2,3,1).contiguous() 44 | y_conf = y_conf.view(N,-1,13) 45 | y_confs.append(y_conf) 46 | 47 | loc_preds = torch.cat(y_locs, 1) 48 | conf_preds = torch.cat(y_confs, 1) 49 | return loc_preds, conf_preds 50 | -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/multibox_loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLoss(nn.Module): 12 | num_classes = 13 13 | 14 | def __init__(self): 15 | super(MultiBoxLoss, self).__init__() 16 | 17 | def cross_entropy_loss(self, x, y): 18 | '''Cross entropy loss w/o averaging across all samples. 19 | 20 | Args: 21 | x: (tensor) sized [N,D]. 22 | y: (tensor) sized [N,]. 23 | 24 | Return: 25 | (tensor) cross entroy loss, sized [N,]. 
26 | ''' 27 | xmax = x.data.max() 28 | print('x y size {} {}'.format(x.size(), y.size())) 29 | log_sum_exp = torch.log(torch.sum(torch.exp(x-xmax), 1)) + xmax 30 | print('log_sum_exp {}'.format(log_sum_exp.size())) 31 | return log_sum_exp - x.gather(1, y.view(-1,1)) 32 | 33 | def hard_negative_mining(self, conf_loss, pos): 34 | '''Return negative indices that is 3x the number as postive indices. 35 | 36 | Args: 37 | conf_loss: (tensor) cross entroy loss between conf_preds and conf_targets, sized [N*8732,]. 38 | pos: (tensor) positive(matched) box indices, sized [N,8732]. 39 | 40 | Return: 41 | (tensor) negative indices, sized [N,8732]. 42 | ''' 43 | batch_size, num_boxes = pos.size() 44 | conf_loss[pos.view(-1)] = 0 # set pos boxes = 0, the rest are neg conf_loss 45 | conf_loss = conf_loss.view(batch_size, -1) # [N,8732] 46 | 47 | _,idx = conf_loss.sort(1, descending=True) # sort by neg conf_loss 48 | _,rank = idx.sort(1) # [N,8732] 49 | 50 | num_pos = pos.long().sum(1) # [N,1] 51 | num_neg = torch.clamp(3*num_pos, max=num_boxes-1) # [N,1] 52 | 53 | neg = rank < num_neg.unsqueeze(1).expand_as(rank) 54 | 55 | return neg 56 | 57 | def forward(self, loc_preds, loc_targets, conf_preds, conf_targets): 58 | '''Compute loss between (loc_preds, loc_targets) and (conf_preds, conf_targets). 59 | 60 | Args: 61 | loc_preds: (tensor) predicted locations, sized [batch_size, 8732, 4]. 62 | loc_targets: (tensor) encoded target locations, sized [batch_size, 8732, 4]. 63 | conf_preds: (tensor) predicted class confidences, sized [batch_size, 8732, num_classes]. 64 | conf_targets: (tensor) encoded target classes, sized [batch_size, 8732]. 65 | 66 | loss: 67 | (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + CrossEntropyLoss(conf_preds, conf_targets). 68 | ''' 69 | 70 | # loc_preds = loc_preds[:,:8732,:] 71 | # conf_preds = conf_preds[:,:8732,:] 72 | 73 | batch_size, num_boxes, _ = loc_preds.size() 74 | pos = conf_targets > 0 # [N,8732], pos means the box matched. 
75 | # print(pos.size()) 76 | num_matched_boxes = pos.data.float().sum() 77 | if num_matched_boxes == 0: 78 | return torch.tensor([0.], requires_grad=True) 79 | 80 | ################################################################ 81 | # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets) 82 | ################################################################ 83 | pos_mask = pos.unsqueeze(2).expand_as(loc_preds) # [N,8732,4] 84 | pos_loc_preds = loc_preds[pos_mask].view(-1,4) # [#pos,4] 85 | pos_loc_targets = loc_targets[pos_mask].view(-1,4) # [#pos,4] 86 | 87 | loc_loss = F.smooth_l1_loss(pos_loc_preds, pos_loc_targets, size_average=False) 88 | 89 | ################################################################ 90 | # conf_loss = CrossEntropyLoss(pos_conf_preds, pos_conf_targets) 91 | # + CrossEntropyLoss(neg_conf_preds, neg_conf_targets) 92 | ################################################################ 93 | conf_preds = conf_preds.contiguous() 94 | # print(conf_preds.size(), conf_targets.size()) 95 | conf_loss = F.cross_entropy(conf_preds.view(-1,self.num_classes), \ 96 | conf_targets.view(-1), reduce=False) # [N*8732,] 97 | neg = self.hard_negative_mining(conf_loss, pos) # [N,8732] 98 | 99 | pos_mask = pos.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 100 | neg_mask = neg.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 101 | mask = (pos_mask+neg_mask).gt(0) 102 | 103 | pos_and_neg = (pos+neg).gt(0) 104 | preds = conf_preds[mask].view(-1,self.num_classes) # [#pos+#neg,21] 105 | targets = conf_targets[pos_and_neg] # [#pos+#neg,] 106 | conf_loss = F.cross_entropy(preds, targets, size_average=False) 107 | 108 | loc_loss /= num_matched_boxes 109 | conf_loss /= num_matched_boxes 110 | 111 | return loc_loss + conf_loss 112 | -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | class L2Norm(nn.Module): 7 | '''L2Norm layer across all channels and scale.''' 8 | def __init__(self, in_features,scale): 9 | super(L2Norm, self).__init__() 10 | self.weight = nn.Parameter(torch.Tensor(in_features)) 11 | self.reset_parameters(scale) 12 | 13 | def reset_parameters(self, scale): 14 | nn.init.constant_(self.weight, scale) 15 | 16 | def forward(self, x): 17 | x = F.normalize(x, dim=1) 18 | scale = self.weight[None,:,None,None] 19 | return scale * x -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.nn.functional as F 4 | import torchvision.transforms as transforms 5 | 6 | import sys 7 | from mdssd import MDSSD300 8 | from encoder import DataEncoder 9 | import cv2 10 | 11 | VOC_LABELS = ( 12 | 'ignored regions', 13 | 'pedestrian', 14 | 'people', 15 | 'bicycle', 16 | 'car', 17 | 'van', 18 | 'truck', 19 | 'tricycle', 20 | 'awning-tricycle', 21 | 'bus', 22 | 'motor', 23 | 'other' 24 | ) 25 | 26 | 27 | # Load model 28 | net = MDSSD300() 29 | checkpoint = torch.load('./checkpoint/ckpt.pth') 30 | 31 | keys = [] 32 | for k,v in checkpoint['net'].items(): 33 | if "module" in k: 34 | keys.append(k) 35 | for i in keys: 36 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 37 | del checkpoint['net'][i] 38 | 39 | 
net.load_state_dict(checkpoint['net']) 40 | net.eval() 41 | 42 | if len(sys.argv) == 2: 43 | img_path = sys.argv[1] 44 | else: 45 | img_path = './images/img7.jpg' 46 | 47 | # Load test image 48 | img = cv2.imread(img_path) 49 | img1 = cv2.resize(img, (300, 300)) 50 | transform = transforms.Compose([transforms.ToTensor(), 51 | transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]) 52 | img1 = transform(img1) 53 | 54 | # Forward 55 | with torch.no_grad(): 56 | x = torch.tensor(img1) 57 | loc_preds, conf = net(x.unsqueeze(0)) 58 | # Decode 59 | data_encoder = DataEncoder() 60 | boxes, labels, scores = data_encoder.decode(loc_preds.data.squeeze(0), F.softmax(conf.squeeze(0), dim=1).data) 61 | for box, label, score in zip(boxes, labels, scores): 62 | for b, s in zip(box, score): 63 | if s > 0.5: 64 | b[::2] *= img.shape[1] 65 | b[1::2] *= img.shape[0] 66 | print('label:',VOC_LABELS[int(label[0])], 'score:', score) 67 | b = list(b) 68 | cv2.rectangle(img, (b[0], b[1]), (b[2], b[3]), (255, 255, 255), 2) 69 | title = '{}: {}'.format(VOC_LABELS[int(label[0])], round(float(score[0]), 2)) 70 | cv2.putText(img, title, (b[0], b[1]), cv2.FONT_ITALIC, 0.6, (0, 255, 0), 2) 71 | cv2.imshow('img', img) 72 | cv2.waitKey(0) -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import argparse 5 | import itertools 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | import torch.backends.cudnn as cudnn 12 | 13 | import torchvision 14 | import torchvision.transforms as transforms 15 | 16 | import numpy as np 17 | from mdssd import MDSSD300 18 | from datagen import ListDataset 19 | from multibox_loss import MultiBoxLoss 20 | 21 | TRAIN_IMG_DIR = '../../../../VisDrone2019/dataset/VisDrone2018-DET-train/images/' 22 | TRAIN_ANNOT_DIR = '../../../../VisDrone2019/dataset/VisDrone2018-DET-train/annotations/' 23 | VAL_IMAGE_DIR = '../../../../VisDrone2019/dataset/VisDrone2019-DET-val/images/' 24 | VAL_ANNOT_DIR = '../../../../VisDrone2019/dataset/VisDrone2019-DET-val/annotations/' 25 | 26 | lr = 0.001 27 | resume = True # Resume from checkpoint 28 | epoch = 200 29 | batch_size = 8 30 | 31 | use_cuda = torch.cuda.is_available() 32 | best_loss = float('inf') # best test loss 33 | start_epoch = 0 # start from epoch 0 or last epoch 34 | 35 | # Data 36 | print('==> Preparing data..') 37 | transform = transforms.Compose([transforms.ToTensor(), 38 | transforms.Normalize(mean=(0.356, 0.368, 0.362), std=(0.242, 0.235, 0.236))]) 39 | 40 | trainset = ListDataset(root=TRAIN_IMG_DIR, list_file=TRAIN_ANNOT_DIR, train=True, transform=transform) 41 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True, num_workers=4) 42 | 43 | valset = ListDataset(root=VAL_IMAGE_DIR, list_file=VAL_ANNOT_DIR, train=True, transform=transform) 44 | valloader = torch.utils.data.DataLoader(valset, batch_size=8, shuffle=True, num_workers=4) 45 | 46 | # Model 47 | net = MDSSD300() 48 | if resume: 49 | print('==> Resuming from checkpoint..') 50 | checkpoint = torch.load('./checkpoint/ckpt.pth') 51 | 52 | keys = [] 53 | for k,v in checkpoint['net'].items(): 54 | if "module" in k: 55 | keys.append(k) 56 | for i in keys: 57 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 58 | del checkpoint['net'][i] 59 | 60 | 
net.load_state_dict(checkpoint['net']) 61 | best_loss = checkpoint['loss'] 62 | start_epoch = checkpoint['epoch'] 63 | else: 64 | # Convert from pretrained VGG model. 65 | try: 66 | net.load_state_dict(torch.load('../model/ssd.pth')) 67 | print('==> Pretrain model read successfully') 68 | except: 69 | print('==> Pretrain model read failed or not existed, training from init') 70 | 71 | criterion = MultiBoxLoss() 72 | 73 | if use_cuda: 74 | net = torch.nn.DataParallel(net, device_ids=[0]) 75 | net.cuda() 76 | cudnn.benchmark = True 77 | 78 | optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4) 79 | # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.25) 80 | 81 | # Training 82 | def train(epoch,prev_val_loss, last_saved): 83 | print('\nEpoch: %d' % epoch) 84 | net.train() 85 | train_loss = 0 86 | for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader): 87 | if use_cuda: 88 | images = images.cuda() 89 | loc_targets = loc_targets.cuda() 90 | conf_targets = conf_targets.cuda() 91 | 92 | images = torch.tensor(images) 93 | loc_targets = torch.tensor(loc_targets) 94 | conf_targets = torch.tensor(conf_targets) 95 | 96 | optimizer.zero_grad() 97 | loc_preds, conf_preds = net(images) 98 | loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 99 | loss.backward() 100 | optimizer.step() 101 | # scheduler.step() 102 | 103 | train_loss += loss.item() 104 | 105 | if batch_idx%100 == 0: 106 | val_loss_tot = 0 107 | for batch_idx_val, (images, loc_targets, conf_targets) in enumerate(valloader): 108 | if use_cuda: 109 | images = images.cuda() 110 | loc_targets = loc_targets.cuda() 111 | conf_targets = conf_targets.cuda() 112 | 113 | images = torch.tensor(images) 114 | loc_targets = torch.tensor(loc_targets) 115 | conf_targets = torch.tensor(conf_targets) 116 | 117 | loc_preds, conf_preds = net(images) 118 | val_loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 119 | val_loss_tot += val_loss.item() 120 | 121 | val_loss_tot /= (batch_idx_val+1) 122 | if val_loss_tot < prev_val_loss: 123 | os.makedirs('checkpoint', exist_ok=True) 124 | torch.save({ 125 | 'epoch': epoch, 126 | 'net': net.state_dict(), 127 | 'loss': loss, 128 | }, 'checkpoint/ckpt.pth') 129 | print("Saved.") 130 | prev_val_loss = val_loss_tot 131 | last_saved = [epoch, batch_idx] 132 | 133 | print('epoch: {}, batch_idx: {},loss: {}, train_loss: {}, best_val_loss: {}, last_saved: {}'.format(epoch, batch_idx, loss.item(), train_loss/(batch_idx+1), prev_val_loss, last_saved)) 134 | 135 | return prev_val_loss, last_saved 136 | 137 | 138 | prev_val_loss = 999 139 | last_saved = [start_epoch,0] 140 | for epoch_num in range(start_epoch, start_epoch+epoch): 141 | prev_val_loss, last_saved = train(epoch_num, prev_val_loss, last_saved) 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch_Object_Detection 2 | This repository contains Pytorch implementations of single-shot approaches for object detection in images. 
3 | 
4 | ## Papers
5 | - [SSD: Single Shot MultiBox Detector](https://www.cs.unc.edu/~wliu/papers/ssd.pdf)
6 | - [MDSSD: Multi-scale Deconvolutional Single Shot Detector for Small Objects](https://arxiv.org/pdf/1805.07009.pdf)
7 | - [Augmentation for small object detection](https://arxiv.org/pdf/1902.07296.pdf)
8 | - [CFENet: An Accurate and Efficient Single-Shot Object Detector for Autonomous Driving](https://arxiv.org/pdf/1806.09790.pdf)
9 | - [Comprehensive Feature Enhancement Module for Single-Shot Object Detector](https://qijiezhao.github.io/imgs/cfenetv1.pdf)
10 | 
--------------------------------------------------------------------------------
/SSD/scripts/convert_vgg.py:
--------------------------------------------------------------------------------
1 | '''Convert pretrained VGG model to SSD.
2 | 
3 | VGG model downloaded from the PyTorch model zoo: https://download.pytorch.org/models/vgg16-397923af.pth
4 | '''
5 | import sys
6 | sys.path.append("../src")
7 | import torch
8 | 
9 | from ssd import SSD300
10 | 
11 | 
12 | vgg = torch.load('../model/vgg16-397923af.pth')
13 | 
14 | ssd = SSD300()
15 | layer_indices = [0,2,5,7,10,12,14,17,19,21]
16 | 
17 | for layer_idx in layer_indices:
18 |     ssd.base[layer_idx].weight.data = vgg['features.%d.weight' % layer_idx]
19 |     ssd.base[layer_idx].bias.data = vgg['features.%d.bias' % layer_idx]
20 | 
21 | # [24,26,28]
22 | ssd.conv5_1.weight.data = vgg['features.24.weight']
23 | ssd.conv5_1.bias.data = vgg['features.24.bias']
24 | ssd.conv5_2.weight.data = vgg['features.26.weight']
25 | ssd.conv5_2.bias.data = vgg['features.26.bias']
26 | ssd.conv5_3.weight.data = vgg['features.28.weight']
27 | ssd.conv5_3.bias.data = vgg['features.28.bias']
28 | for k in ssd.state_dict():
29 |     print(k)
30 | torch.save(ssd.state_dict(), '../model/ssd.pth')
31 | 
--------------------------------------------------------------------------------
/SSD/scripts/datagen.py:
--------------------------------------------------------------------------------
1 | '''
2 | Load image/class/box from an annotation file.
3 | 
4 | '''
5 | from __future__ import print_function
6 | 
7 | import os
8 | import sys
9 | import os.path
10 | 
11 | import random
12 | import numpy as np
13 | 
14 | import torch
15 | import torch.utils.data as data
16 | import torchvision.transforms as transforms
17 | 
18 | from encoder import DataEncoder
19 | import cv2
20 | 
21 | class ListDataset(data.Dataset):
22 |     img_size = 300
23 | 
24 |     def __init__(self, root, list_file, train, transform):
25 |         '''
26 |         Args:
27 |           root: (str) directory to images.
28 |           list_file: (str) path to annotation files.
29 |           train: (boolean) train or test.
30 |           transform: ([transforms]) image transforms.
31 |         '''
32 |         self.root = root
33 |         self.train = train
34 |         self.transform = transform
35 | 
36 |         self.fnames = []
37 |         self.boxes = []
38 |         self.labels = []
39 | 
40 |         self.data_encoder = DataEncoder()
41 |         self.num_samples = 0
42 | 
43 |         for i in os.listdir(list_file):
44 |             self.num_samples += 1
45 |             self.fnames.append(i)
46 |             box = []
47 |             labels = []
48 |             with open(os.path.join(list_file,i)) as f:
49 |                 f = f.read().split("\n")
50 |                 f = f[:-1]
51 |                 num_objs = len(f)
52 | 
53 |                 for j in range(num_objs):
54 |                     f[j] = f[j].split(",")
55 |                     xmin = float(f[j][0])
56 |                     ymin = float(f[j][1])
57 |                     w = float(f[j][2])
58 |                     h = float(f[j][3])
59 | 
60 |                     box.append([xmin,ymin,xmin+w,ymin+h])  # xmax = xmin + width, ymax = ymin + height
61 |                     labels.append(int(f[j][5]))
62 | 
63 |             self.boxes.append(torch.Tensor(box))
64 |             self.labels.append(torch.LongTensor(labels))
65 | 
66 |     def __getitem__(self, idx):
67 |         '''Load an image, and encode its bbox locations and class labels.
68 |         Args:
69 |           idx: (int) image index.
70 |         Returns:
71 |           img: (tensor) image tensor.
72 |           loc_target: (tensor) location targets, sized [8732,4].
73 |           conf_target: (tensor) label targets, sized [8732,].
74 |         '''
75 |         # Load image and bbox locations.
76 |         fname = self.fnames[idx]
77 |         img = cv2.imread(os.path.join(self.root, fname[:-4]+".jpg"))
78 |         boxes = self.boxes[idx].clone()
79 |         labels = self.labels[idx]
80 | 
81 |         # Data augmentation while training.
82 |         if self.train:
83 |             img, boxes = self.random_flip(img, boxes)
84 |             img, boxes, labels = self.random_crop(img, boxes, labels)
85 | 
86 |         # Scale bbox locations to [0,1].
87 |         w,h = img.shape[1], img.shape[0]
88 |         boxes /= torch.Tensor([w,h,w,h]).expand_as(boxes)
89 |         img = cv2.resize(img, (self.img_size,self.img_size))
90 |         img = self.transform(img)
91 | 
92 |         # Encode loc & conf targets.
93 | 
94 |         loc_target, conf_target = self.data_encoder.encode(boxes, labels)
95 |         return img, loc_target, conf_target
96 | 
97 |     def random_flip(self, img, boxes):
98 |         '''Randomly flip the image and adjust the bbox locations.
99 |         For bbox (xmin, ymin, xmax, ymax), the flipped bbox is:
100 |         (w-xmax, ymin, w-xmin, ymax).
101 |         Args:
102 |           img: (ndarray.Image) image.
103 |           boxes: (tensor) bbox locations, sized [#obj, 4].
104 |         Returns:
105 |           img: (ndarray.Image) randomly flipped image.
106 |           boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4].
107 |         '''
108 |         if random.random() < 0.5:
109 |             img = cv2.flip(img, 1)
110 |             w = img.shape[1]
111 |             xmin = w - boxes[:,2]
112 |             xmax = w - boxes[:,0]
113 |             boxes[:,0] = xmin
114 |             boxes[:,2] = xmax
115 |         return img, boxes
116 | 
117 |     def random_crop(self, img, boxes, labels):
118 |         '''Randomly crop the image and adjust the bbox locations.
119 |         For more details, see 'Chapter 2.2: Data augmentation' of the paper.
120 |         Args:
121 |           img: (ndarray.Image) image.
122 |           boxes: (tensor) bbox locations, sized [#obj, 4].
123 |           labels: (tensor) bbox labels, sized [#obj,].
124 |         Returns:
125 |           img: (ndarray.Image) cropped image.
126 |           selected_boxes: (tensor) selected bbox locations.
127 |           labels: (tensor) selected bbox labels.
128 | ''' 129 | imw, imh = img.shape[1], img.shape[0] 130 | while True: 131 | min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])# random choice the one 132 | if min_iou is None: 133 | return img, boxes, labels 134 | 135 | for _ in range(100): 136 | w = random.randrange(int(0.1*imw), imw) 137 | h = random.randrange(int(0.1*imh), imh) 138 | 139 | if h > 2*w or w > 2*h or h < 1 or w < 1: 140 | continue 141 | 142 | x = random.randrange(imw - w) 143 | y = random.randrange(imh - h) 144 | roi = torch.Tensor([[x, y, x+w, y+h]]) 145 | 146 | center = (boxes[:,:2] + boxes[:,2:]) / 2 # [N,2] 147 | roi2 = roi.expand(len(center), 4) # [N,4] 148 | 149 | mask = (center > roi2[:,:2]) & (center < roi2[:,2:]) # [N,2] 150 | mask = mask[:,0] & mask[:,1] #[N,] 151 | 152 | if not mask.any(): 153 | continue 154 | 155 | selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1)) 156 | 157 | iou = self.data_encoder.iou(selected_boxes, roi) 158 | if iou.min() < min_iou: 159 | continue 160 | img = img[y:y+h, x:x+w, :] 161 | 162 | selected_boxes[:,0].add_(-x).clamp_(min=0, max=w) 163 | selected_boxes[:,1].add_(-y).clamp_(min=0, max=h) 164 | selected_boxes[:,2].add_(-x).clamp_(min=0, max=w) 165 | selected_boxes[:,3].add_(-y).clamp_(min=0, max=h) 166 | 167 | return img, selected_boxes, labels[mask] 168 | 169 | def __len__(self): 170 | return self.num_samples -------------------------------------------------------------------------------- /SSD/scripts/encoder.py: -------------------------------------------------------------------------------- 1 | '''Encode target locations and labels.''' 2 | import torch 3 | 4 | import math 5 | import itertools 6 | 7 | class DataEncoder: 8 | def __init__(self): 9 | '''Compute default box sizes with scale and aspect transform.''' 10 | scale = 300. 11 | steps = [s / scale for s in (8, 16, 32, 64, 100, 300)] 12 | sizes = [s / scale for s in (30, 60, 111, 162, 213, 264, 315)] 13 | aspect_ratios = ((2,), (2,3), (2,3), (2,3), (2,), (2,)) 14 | feature_map_sizes = (38, 19, 10, 5, 3, 1) 15 | num_layers = len(feature_map_sizes) 16 | 17 | boxes = [] 18 | for i in range(num_layers): 19 | fmsize = feature_map_sizes[i] # feature map size 20 | for h,w in itertools.product(range(fmsize), repeat=2): 21 | cx = (w + 0.5)*steps[i] 22 | cy = (h + 0.5)*steps[i] 23 | 24 | s = sizes[i] 25 | boxes.append((cx, cy, s, s)) 26 | 27 | s = math.sqrt(sizes[i] * sizes[i+1]) 28 | boxes.append((cx, cy, s, s)) 29 | 30 | s = sizes[i] 31 | for ar in aspect_ratios[i]: 32 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 33 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 34 | 35 | self.default_boxes = torch.Tensor(boxes) 36 | 37 | def iou(self, box1, box2): 38 | '''Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2]. 39 | 40 | Args: 41 | box1: (tensor) bounding boxes, sized [N,4]. 42 | box2: (tensor) bounding boxes, sized [M,4]. 43 | 44 | Return: 45 | (tensor) iou, sized [N,M]. 
46 |         '''
47 |         N = box1.size(0)
48 |         M = box2.size(0)
49 | 
50 |         lt = torch.max(
51 |             box1[:,:2].unsqueeze(1).expand(N,M,2),  # [N,2] -> [N,1,2] -> [N,M,2]
52 |             box2[:,:2].unsqueeze(0).expand(N,M,2),  # [M,2] -> [1,M,2] -> [N,M,2]
53 |         )
54 | 
55 |         rb = torch.min(
56 |             box1[:,2:].unsqueeze(1).expand(N,M,2),  # [N,2] -> [N,1,2] -> [N,M,2]
57 |             box2[:,2:].unsqueeze(0).expand(N,M,2),  # [M,2] -> [1,M,2] -> [N,M,2]
58 |         )
59 | 
60 |         wh = rb - lt  # [N,M,2]
61 |         wh[wh<0] = 0  # clip at 0
62 |         inter = wh[:,:,0] * wh[:,:,1]  # [N,M]
63 | 
64 |         area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1])  # [N,]
65 |         area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1])  # [M,]
66 |         area1 = area1.unsqueeze(1).expand_as(inter)  # [N,] -> [N,1] -> [N,M]
67 |         area2 = area2.unsqueeze(0).expand_as(inter)  # [M,] -> [1,M] -> [N,M]
68 | 
69 |         iou = inter / (area1 + area2 - inter)
70 |         return iou
71 | 
72 |     def encode(self, boxes, classes, threshold=0.5):
73 |         '''Transform target bounding boxes and class labels to SSD boxes and classes.
74 | 
75 |         Match each object box to all the default boxes, pick the ones with the
76 |         Jaccard-Index > 0.5:
77 |             Jaccard(A,B) = AB / (A+B-AB)
78 | 
79 |         Args:
80 |           boxes: (tensor) object bounding boxes (xmin,ymin,xmax,ymax) of an image, sized [#obj, 4].
81 |           classes: (tensor) object class labels of an image, sized [#obj,].
82 |           threshold: (float) Jaccard index threshold
83 | 
84 |         Returns:
85 |           loc: (tensor) encoded bounding boxes, sized [8732, 4].
86 |           conf: (tensor) class labels, sized [8732,]
87 |         '''
88 |         default_boxes = self.default_boxes
89 |         num_default_boxes = default_boxes.size(0)
90 |         num_objs = boxes.size(0)
91 | 
92 |         iou = self.iou(  # [#obj,8732]
93 |             boxes,
94 |             torch.cat([default_boxes[:,:2] - default_boxes[:,2:]/2,
95 |                        default_boxes[:,:2] + default_boxes[:,2:]/2], 1)
96 |         )
97 | 
98 |         iou, max_idx = iou.max(0)  # [1,8732]
99 |         max_idx.squeeze_(0)        # [8732,]
100 |         iou.squeeze_(0)            # [8732,]
101 | 
102 |         boxes = boxes[max_idx]     # [8732,4]
103 |         variances = [0.1, 0.2]
104 |         cxcy = (boxes[:,:2] + boxes[:,2:])/2 - default_boxes[:,:2]  # [8732,2]
105 |         cxcy /= variances[0] * default_boxes[:,2:]
106 |         wh = (boxes[:,2:] - boxes[:,:2]) / default_boxes[:,2:]      # [8732,2]
107 |         wh = torch.log(wh) / variances[1]
108 |         loc = torch.cat([cxcy, wh], 1)  # [8732,4]
109 | 
110 |         conf = 1 + classes[max_idx]   # [8732,], background class = 0
111 |         conf[iou<threshold] = 0       # background
112 | 
113 |         return loc, conf
114 | 
115 |     def nms(self, bboxes, scores, threshold=0.5, mode='union'):
116 |         '''Non maximum suppression.
117 | 
118 |         Args:
119 |           bboxes: (tensor) bounding boxes, sized [N,4].
120 |           scores: (tensor) bbox scores, sized [N,].
121 |           threshold: (float) overlap threshold.
122 |           mode: (str) 'union' or 'min'.
123 | 
124 |         Returns:
125 |           keep: (tensor) selected indices.
126 | 
127 |         '''
128 |         x1 = bboxes[:,0]
129 |         y1 = bboxes[:,1]
130 |         x2 = bboxes[:,2]
131 |         y2 = bboxes[:,3]
132 | 
133 |         areas = (x2-x1) * (y2-y1)
134 |         _, order = scores.sort(0, descending=True)
135 | 
136 |         keep = []
137 |         while order.numel() > 0:
138 |             i = order[0]
139 |             keep.append(i)
140 | 
141 |             if order.numel() == 1:
142 |                 break
143 | 
144 |             xx1 = x1[order[1:]].clamp(min=x1[i])
145 |             yy1 = y1[order[1:]].clamp(min=y1[i])
146 |             xx2 = x2[order[1:]].clamp(max=x2[i])
147 |             yy2 = y2[order[1:]].clamp(max=y2[i])
148 | 
149 |             w = (xx2-xx1).clamp(min=0)
150 |             h = (yy2-yy1).clamp(min=0)
151 |             inter = w*h
152 | 
153 |             if mode == 'union':
154 |                 ovr = inter / (areas[i] + areas[order[1:]] - inter)
155 |             elif mode == 'min':
156 |                 ovr = inter / areas[order[1:]].clamp(max=areas[i])
157 |             else:
158 |                 raise TypeError('Unknown nms mode: %s.' % mode)
159 | 
160 |             ids = (ovr<=threshold).nonzero().squeeze()
161 |             if ids.numel() == 0:
162 |                 break
163 |             order = order[ids+1]
164 |         return torch.LongTensor(keep)
165 | 
166 |     def decode(self, loc, conf):
167 |         '''Transform predicted loc/conf back to real bbox locations and class labels.
168 | 
169 |         Args:
170 |           loc: (tensor) predicted loc, sized [8732,4].
171 |           conf: (tensor) predicted conf, sized [8732,21].
172 | 
173 |         Returns:
174 |           boxes: (tensor) bbox locations, sized [#obj, 4].
175 |           labels: (tensor) class labels, sized [#obj,1].
176 | ''' 177 | variances = (0.1, 0.2) 178 | wh = torch.exp(loc[:,2:]*variances[1]) * self.default_boxes[:,2:] 179 | cxcy = loc[:,:2] * variances[0] * self.default_boxes[:,2:] + self.default_boxes[:,:2] 180 | box_preds = torch.cat([cxcy-wh/2, cxcy+wh/2], 1) # [8732,4] 181 | 182 | boxes = [] 183 | labels = [] 184 | scores = [] 185 | num_classes = conf.size(1) 186 | for i in range(num_classes-1): 187 | score = conf[:,i+1] # class i corresponds to (i+1) column 188 | mask = score > 0.1 189 | 190 | if not mask.any(): 191 | continue 192 | 193 | box = box_preds[mask.nonzero().squeeze()] 194 | score = score[mask] 195 | 196 | if len(score) == 1: 197 | continue 198 | keep = self.nms(box, score, threshold=0.3) 199 | boxes.append(box[keep]) 200 | labels.append(torch.LongTensor(len(box[keep])).fill_(i)) 201 | scores.append(score[keep]) 202 | 203 | return boxes, labels, scores 204 | -------------------------------------------------------------------------------- /SSD/scripts/multibox_layer.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLayer(nn.Module): 12 | num_classes = 21 13 | num_anchors = [4,6,6,6,4,4] 14 | in_planes = [512,1024,512,256,256,256] 15 | 16 | def __init__(self): 17 | super(MultiBoxLayer, self).__init__() 18 | 19 | self.loc_layers = nn.ModuleList() 20 | self.conf_layers = nn.ModuleList() 21 | for i in range(len(self.in_planes)): 22 | self.loc_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*4, kernel_size=3, padding=1)) 23 | self.conf_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*21, kernel_size=3, padding=1)) 24 | 25 | def forward(self, xs): 26 | ''' 27 | Args: 28 | xs: (list) of tensor containing intermediate layer outputs. 29 | 30 | Returns: 31 | loc_preds: (tensor) predicted locations, sized [N,8732,4]. 32 | conf_preds: (tensor) predicted class confidences, sized [N,8732,21]. 33 | ''' 34 | y_locs = [] 35 | y_confs = [] 36 | for i, x in enumerate(xs): 37 | y_loc = self.loc_layers[i](x) 38 | N = y_loc.size(0) 39 | y_loc = y_loc.permute(0,2,3,1).contiguous() 40 | y_loc = y_loc.view(N,-1,4) 41 | y_locs.append(y_loc) 42 | 43 | y_conf = self.conf_layers[i](x) 44 | y_conf = y_conf.permute(0,2,3,1).contiguous() 45 | y_conf = y_conf.view(N,-1,21) 46 | y_confs.append(y_conf) 47 | 48 | loc_preds = torch.cat(y_locs, 1) 49 | conf_preds = torch.cat(y_confs, 1) 50 | return loc_preds, conf_preds 51 | -------------------------------------------------------------------------------- /SSD/scripts/multibox_loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLoss(nn.Module): 12 | num_classes = 21 13 | 14 | def __init__(self): 15 | super(MultiBoxLoss, self).__init__() 16 | 17 | def cross_entropy_loss(self, x, y): 18 | '''Cross entropy loss w/o averaging across all samples. 19 | 20 | Args: 21 | x: (tensor) sized [N,D]. 22 | y: (tensor) sized [N,]. 23 | 24 | Return: 25 | (tensor) cross entroy loss, sized [N,]. 
26 |         '''
27 |         xmax = x.data.max()
28 |         # subtract the max for numerical stability before exponentiating
29 |         log_sum_exp = torch.log(torch.sum(torch.exp(x-xmax), 1)) + xmax
30 |         # per-sample cross entropy: logsumexp(x) - x[y]
31 |         return log_sum_exp - x.gather(1, y.view(-1,1))
32 | 
33 |     def hard_negative_mining(self, conf_loss, pos):
34 |         '''Return negative indices that are 3x the number of positive indices.
35 | 
36 |         Args:
37 |           conf_loss: (tensor) cross entropy loss between conf_preds and conf_targets, sized [N*8732,].
38 |           pos: (tensor) positive(matched) box indices, sized [N,8732].
39 | 
40 |         Return:
41 |           (tensor) negative indices, sized [N,8732].
42 |         '''
43 |         batch_size, num_boxes = pos.size()
44 |         conf_loss[pos.view(-1)] = 0  # set pos boxes = 0, the rest are neg conf_loss
45 |         conf_loss = conf_loss.view(batch_size, -1)  # [N,8732]
46 | 
47 |         _,idx = conf_loss.sort(1, descending=True)  # sort by neg conf_loss
48 |         _,rank = idx.sort(1)  # [N,8732]
49 | 
50 |         num_pos = pos.long().sum(1)  # [N,]
51 |         num_neg = torch.clamp(3*num_pos, max=num_boxes-1)  # [N,]
52 | 
53 |         neg = rank < num_neg.unsqueeze(1).expand_as(rank)
54 | 
55 |         return neg
56 | 
57 |     def forward(self, loc_preds, loc_targets, conf_preds, conf_targets):
58 |         '''Compute loss between (loc_preds, loc_targets) and (conf_preds, conf_targets).
59 | 
60 |         Args:
61 |           loc_preds: (tensor) predicted locations, sized [batch_size, 8732, 4].
62 |           loc_targets: (tensor) encoded target locations, sized [batch_size, 8732, 4].
63 |           conf_preds: (tensor) predicted class confidences, sized [batch_size, 8732, num_classes].
64 |           conf_targets: (tensor) encoded target classes, sized [batch_size, 8732].
65 | 
66 |         Returns:
67 |           (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + CrossEntropyLoss(conf_preds, conf_targets).
68 |         '''
69 |         batch_size, num_boxes, _ = loc_preds.size()
70 |         pos = conf_targets > 0  # [N,8732], pos means the box matched.
71 | num_matched_boxes = pos.data.float().sum() 72 | if num_matched_boxes == 0: 73 | return torch.tensor([0], requires_grad=True) 74 | 75 | ################################################################ 76 | # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets) 77 | ################################################################ 78 | pos_mask = pos.unsqueeze(2).expand_as(loc_preds) # [N,8732,4] 79 | pos_loc_preds = loc_preds[pos_mask].view(-1,4) # [#pos,4] 80 | pos_loc_targets = loc_targets[pos_mask].view(-1,4) # [#pos,4] 81 | 82 | loc_loss = F.smooth_l1_loss(pos_loc_preds, pos_loc_targets, size_average=False) 83 | 84 | ################################################################ 85 | # conf_loss = CrossEntropyLoss(pos_conf_preds, pos_conf_targets) 86 | # + CrossEntropyLoss(neg_conf_preds, neg_conf_targets) 87 | ################################################################ 88 | conf_loss = F.cross_entropy(conf_preds.view(-1,self.num_classes), \ 89 | conf_targets.view(-1), reduce=False) # [N*8732,] 90 | neg = self.hard_negative_mining(conf_loss, pos) # [N,8732] 91 | 92 | pos_mask = pos.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 93 | neg_mask = neg.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 94 | mask = (pos_mask+neg_mask).gt(0) 95 | 96 | pos_and_neg = (pos+neg).gt(0) 97 | preds = conf_preds[mask].view(-1,self.num_classes) # [#pos+#neg,21] 98 | targets = conf_targets[pos_and_neg] # [#pos+#neg,] 99 | conf_loss = F.cross_entropy(preds, targets, size_average=False) 100 | 101 | loc_loss /= num_matched_boxes 102 | conf_loss /= num_matched_boxes 103 | 104 | return loc_loss + conf_loss 105 | -------------------------------------------------------------------------------- /SSD/scripts/ssd.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | 9 | from torch.autograd import Variable 10 | 11 | from multibox_layer import MultiBoxLayer 12 | 13 | 14 | class L2Norm(nn.Module): 15 | '''L2Norm layer across all channels and scale.''' 16 | def __init__(self, in_features,scale): 17 | super(L2Norm, self).__init__() 18 | self.weight = nn.Parameter(torch.Tensor(in_features)) 19 | self.reset_parameters(scale) 20 | 21 | def reset_parameters(self, scale): 22 | nn.init.constant_(self.weight, scale) 23 | 24 | def forward(self, x): 25 | x = F.normalize(x, dim=1) 26 | scale = self.weight[None,:,None,None] 27 | return scale * x 28 | 29 | 30 | class SSD300(nn.Module): 31 | input_size = 300 32 | 33 | def __init__(self): 34 | super(SSD300, self).__init__() 35 | 36 | # model 37 | self.base = self.VGG16() 38 | self.norm4 = L2Norm(512, 20) # 38 39 | 40 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 41 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 42 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 43 | 44 | self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 45 | 46 | self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 47 | 48 | self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1) 49 | self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2) 50 | 51 | self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1) 52 | self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2) 53 | 54 | self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1) 55 | self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3) 56 | 57 | 
self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1) 58 | self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3) 59 | 60 | # multibox layer 61 | self.multibox = MultiBoxLayer() 62 | 63 | def forward(self, x): 64 | hs = [] 65 | 66 | h = self.base(x) 67 | hs.append(self.norm4(h)) # conv4_3 68 | h = F.max_pool2d(h, kernel_size=2, stride=2, ceil_mode=True) 69 | 70 | h = F.relu(self.conv5_1(h)) 71 | h = F.relu(self.conv5_2(h)) 72 | h = F.relu(self.conv5_3(h)) 73 | h = F.max_pool2d(h, kernel_size=3, padding=1, stride=1, ceil_mode=True) 74 | 75 | h = F.relu(self.conv6(h)) 76 | h = F.relu(self.conv7(h)) 77 | hs.append(h) # conv7 78 | h = F.relu(self.conv8_1(h)) 79 | h = F.relu(self.conv8_2(h)) 80 | hs.append(h) # conv8_2 81 | h = F.relu(self.conv9_1(h)) 82 | h = F.relu(self.conv9_2(h)) 83 | hs.append(h) # conv9_2 84 | h = F.relu(self.conv10_1(h)) 85 | h = F.relu(self.conv10_2(h)) 86 | hs.append(h) # conv10_2 87 | h = F.relu(self.conv11_1(h)) 88 | h = F.relu(self.conv11_2(h)) 89 | hs.append(h) # conv11_2 90 | loc_preds, conf_preds = self.multibox(hs) 91 | 92 | return loc_preds, conf_preds 93 | 94 | def VGG16(self): 95 | '''VGG16 layers.''' 96 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512] 97 | layers = [] 98 | in_channels = 3 99 | for x in cfg: 100 | if x == 'M': 101 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 102 | else: 103 | layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 104 | nn.ReLU(True)] 105 | in_channels = x 106 | return nn.Sequential(*layers) 107 | 108 | if __name__ == '__main__': 109 | t = torch.randn(1, 3, 300, 300) 110 | net = SSD300() 111 | res = net.forward(t) 112 | -------------------------------------------------------------------------------- /SSD/scripts/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import argparse 5 | import itertools 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | import torch.backends.cudnn as cudnn 12 | 13 | import torchvision 14 | import torchvision.transforms as transforms 15 | 16 | import numpy as np 17 | from ssd import SSD300 18 | from datagen import ListDataset 19 | from multibox_loss import MultiBoxLoss 20 | 21 | lr = 0.001 22 | resume = False # Resume from checkpoint 23 | epoch = 200 24 | 25 | use_cuda = torch.cuda.is_available() 26 | best_loss = float('inf') # best test loss 27 | start_epoch = 0 # start from epoch 0 or last epoch 28 | 29 | # Data 30 | print('==> Preparing data..') 31 | transform = transforms.Compose([transforms.ToTensor(), 32 | transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]) 33 | 34 | # trainset = ListDataset(root='../data/train/images/', list_file='../data/train/annotations/', train=True, transform=transform) 35 | # trainloader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True, num_workers=4) 36 | 37 | # Model 38 | net = SSD300() 39 | # if resume: 40 | # print('==> Resuming from checkpoint..') 41 | # checkpoint = torch.load('../checkpoint/ckpt.pth') 42 | # net.load_state_dict(checkpoint['net']) 43 | # best_loss = checkpoint['loss'] 44 | # start_epoch = checkpoint['epoch'] 45 | # else: 46 | # # Convert from pretrained VGG model. 
47 | # try: 48 | # net.load_state_dict(torch.load('../model/ssd.pth')) 49 | # print('==> Pretrain model read successfully') 50 | # except: 51 | # print('==> Pretrain model read failed or not existed, training from init') 52 | print("Loaded Model") 53 | 54 | 55 | 56 | # criterion = MultiBoxLoss() 57 | 58 | # if use_cuda: 59 | # net = torch.nn.DataParallel(net, device_ids=[0]) 60 | # net.cuda() 61 | # cudnn.benchmark = True 62 | 63 | # optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4) 64 | 65 | # # Training 66 | # def train(epoch): 67 | # print('\nEpoch: %d' % epoch) 68 | # net.train() 69 | # train_loss = 0 70 | # # for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader): 71 | # # if use_cuda: 72 | # # images = images.cuda() 73 | # # loc_targets = loc_targets.cuda() 74 | # # conf_targets = conf_targets.cuda() 75 | 76 | # # images = torch.tensor(images) 77 | # # loc_targets = torch.tensor(loc_targets) 78 | # # conf_targets = torch.tensor(conf_targets) 79 | 80 | # # optimizer.zero_grad() 81 | # # loc_preds, conf_preds = net(images) 82 | # # loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 83 | # # loss.backward() 84 | # # optimizer.step() 85 | 86 | # # train_loss += loss.item() 87 | # # if batch_idx%100 == 0: 88 | # # os.makedirs('checkpoint', exist_ok=True) 89 | # # torch.save({ 90 | # # 'epoch': epoch, 91 | # # 'net': net.module.state_dict(), 92 | # # 'loss': loss, 93 | # # }, 'checkpoint/ckpt.pth') 94 | # # print('epoch: {}, batch_idx: {},loss: {}, train_loss: {}'.format(epoch, batch_idx, loss.item(), train_loss/(batch_idx+1))) 95 | 96 | 97 | # for epoch_num in range(1):#range(start_epoch, start_epoch+epoch): 98 | # train(epoch_num) --------------------------------------------------------------------------------