├── .gitignore ├── CFENet └── scripts │ ├── cfenet.py │ ├── datagen.py │ ├── encoder.py │ ├── multibox_layer.py │ ├── multibox_loss.py │ ├── train.py │ └── util.py ├── MDSSD_300 └── scripts │ ├── datagen.py │ ├── encoder.py │ ├── fusion.py │ ├── mdssd.py │ ├── multibox_layer.py │ ├── multibox_loss.py │ ├── norm.py │ ├── test.py │ └── train.py ├── MDSSD_512 └── scripts │ ├── datagen.py │ ├── encoder.py │ ├── fusion.py │ ├── mdssd.py │ ├── multibox_layer.py │ ├── multibox_loss.py │ ├── norm.py │ ├── test.py │ └── train.py ├── MDSSD_augment └── scripts │ ├── augment.py │ ├── datagen.py │ ├── encoder.py │ ├── fusion.py │ ├── mdssd.py │ ├── multibox_layer.py │ ├── multibox_loss.py │ ├── norm.py │ ├── test.py │ └── train.py ├── MDSSD_with_self_attention └── scripts │ ├── attention.py │ ├── datagen.py │ ├── encoder.py │ ├── fusion.py │ ├── gen_test_file.py │ ├── mdssd.py │ ├── multibox_layer.py │ ├── multibox_loss.py │ ├── norm.py │ ├── test.py │ └── train.py ├── README.md └── SSD └── scripts ├── convert_vgg.py ├── datagen.py ├── encoder.py ├── multibox_layer.py ├── multibox_loss.py ├── ssd.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /CFENet/scripts/cfenet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | 9 | from torch.autograd import Variable 10 | 11 | from multibox_layer import MultiBoxLayer 12 | from util import CFE, FFB 13 | 14 | 15 | class L2Norm(nn.Module): 16 | '''L2Norm layer across all channels and scale.''' 17 | def __init__(self, in_features,scale): 18 | super(L2Norm, 
self).__init__() 19 | self.weight = nn.Parameter(torch.Tensor(in_features)) 20 | self.reset_parameters(scale) 21 | 22 | def reset_parameters(self, scale): 23 | nn.init.constant_(self.weight, scale) 24 | 25 | def forward(self, x): 26 | x = F.normalize(x, dim=1) 27 | scale = self.weight[None,:,None,None] 28 | return scale * x 29 | 30 | 31 | class CFENet(nn.Module): 32 | input_size = 300 33 | 34 | def __init__(self): 35 | super(CFENet, self).__init__() 36 | 37 | # model 38 | self.base = self.VGG16() 39 | self.norm4 = L2Norm(512, 20) # 38 40 | 41 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 42 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 43 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 44 | 45 | self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 46 | 47 | self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 48 | 49 | self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1) 50 | self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2) 51 | 52 | self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1) 53 | self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2) 54 | 55 | self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1) 56 | self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3) 57 | 58 | self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1) 59 | self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3) 60 | 61 | # CFE 62 | self.cfe1 = CFE(512, 3) 63 | self.cfe2 = CFE(512, 3) 64 | self.cfe3 = CFE(512, 3) 65 | self.cfe4 = CFE(512, 3) 66 | 67 | # FFB 68 | self.ffb1 = FFB(512,512) 69 | self.ffb2 = FFB(512,512) 70 | 71 | # multibox layer 72 | self.multibox = MultiBoxLayer() 73 | 74 | def forward(self, x): 75 | hs = [] 76 | ffb = [] 77 | 78 | h = self.base(x) 79 | # hs.append(self.norm4(h)) # conv4_3 80 | ffb.append(h) 81 | h = self.cfe1(h) 82 | h = F.max_pool2d(h, kernel_size=2, stride=2, ceil_mode=True) 83 | 84 | h = F.relu(self.conv5_1(h)) 85 | h = F.relu(self.conv5_2(h)) 86 | h = F.relu(self.conv5_3(h)) 87 | ffb.append(h) 88 | h = self.cfe2(h) 89 | h = F.max_pool2d(h, kernel_size=3, padding=1, stride=1, ceil_mode=True) 90 | 91 | h = F.relu(self.conv6(h)) 92 | h = F.relu(self.conv7(h)) 93 | # hs.append(h) # conv7 94 | h = F.relu(self.conv8_1(h)) 95 | h = F.relu(self.conv8_2(h)) 96 | hs.append(h) # conv8_2 97 | ffb.append(h) 98 | h = F.relu(self.conv9_1(h)) 99 | h = F.relu(self.conv9_2(h)) 100 | hs.append(h) # conv9_2 101 | h = F.relu(self.conv10_1(h)) 102 | h = F.relu(self.conv10_2(h)) 103 | hs.append(h) # conv10_2 104 | h = F.relu(self.conv11_1(h)) 105 | h = F.relu(self.conv11_2(h)) 106 | hs.append(h) # conv11_2 107 | 108 | # Feature fusion blocks followed by Comprehensive Feature Enhancement(CFE) module 109 | f1 = self.ffb1(ffb[0], ffb[1]) 110 | f1 = self.cfe3(f1) 111 | hs.append(f1) 112 | f2 = self.ffb2(ffb[1], ffb[2]) 113 | f2 = self.cfe4(f2) 114 | hs.append(f2) 115 | 116 | loc_preds, conf_preds = self.multibox(hs) 117 | 118 | return loc_preds, conf_preds 119 | 120 | def VGG16(self): 121 | '''VGG16 layers.''' 122 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512] 123 | layers = [] 124 | in_channels = 3 125 | for x in cfg: 126 | if x == 'M': 127 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 128 | else: 129 | layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 130 | nn.ReLU(True)] 131 | in_channels = x 132 | return nn.Sequential(*layers) 133 | 134 | if __name__ == '__main__': 135 | t = torch.randn(1, 3, 300, 300) 136 | net = CFENet() 
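# Note added for clarity (not part of the original file): for a 300x300 input the six
# heads passed to MultiBoxLayer are conv8_2 (10x10x512), conv9_2 (5x5x256),
# conv10_2 (3x3x256), conv11_2 (1x1x256) and the two FFB outputs (38x38x512,
# 19x19x512). With [6,6,4,4,4,6] anchors per location this gives
# 6*100 + 6*25 + 4*9 + 4*1 + 4*1444 + 6*361 = 8732 default boxes, so a forward
# pass here would return loc_preds of size [1, 8732, 4] and conf_preds of size [1, 8732, 13].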
137 | # res = net.forward(t) 138 | print(net.base) 139 | -------------------------------------------------------------------------------- /CFENet/scripts/datagen.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load image/class/box from a annotation file. 3 | 4 | ''' 5 | from __future__ import print_function 6 | 7 | import os 8 | import sys 9 | import os.path 10 | 11 | import random 12 | import numpy as np 13 | 14 | import torch 15 | import torch.utils.data as data 16 | import torchvision.transforms as transforms 17 | 18 | from encoder import DataEncoder 19 | import cv2 20 | 21 | class ListDataset(data.Dataset): 22 | img_size = 300 23 | 24 | def __init__(self, root, list_file, train, transform): 25 | ''' 26 | Args: 27 | root: (str) ditectory to images. 28 | list_file: (str) path to annotation files. 29 | train: (boolean) train or test. 30 | transform: ([transforms]) image transforms. 31 | ''' 32 | self.root = root 33 | self.list_file = list_file 34 | self.train = train 35 | self.transform = transform 36 | 37 | self.fnames = [] 38 | self.boxes = [] 39 | self.labels = [] 40 | 41 | self.data_encoder = DataEncoder() 42 | self.num_samples = 0 43 | 44 | # VisDrone 45 | 46 | for i in os.listdir(list_file): 47 | self.num_samples += 1 48 | self.fnames.append(i) 49 | 50 | def __getitem__(self, idx): 51 | '''Load a image, and encode its bbox locations and class labels. 52 | Args: 53 | idx: (int) image index. 54 | Returns: 55 | img: (tensor) image tensor. 56 | loc_target: (tensor) location targets, sized [8732,4]. 57 | conf_target: (tensor) label targets, sized [8732,]. 58 | ''' 59 | # Load image and bbox locations. 60 | fname = self.fnames[idx] 61 | img = cv2.imread(os.path.join(self.root, fname[:-4]+".jpg")) 62 | 63 | box = [] 64 | label = [] 65 | with open(os.path.join(self.list_file,fname)) as f: 66 | f = f.read().split("\n") 67 | f = f[:-1] 68 | num_objs = len(f) 69 | 70 | for j in range(num_objs): 71 | f[j] = f[j].split(",") 72 | xmin = float(f[j][0]) 73 | ymin = float(f[j][1]) 74 | w = float(f[j][2]) 75 | h = float(f[j][3]) 76 | 77 | box.append([xmin,ymin,xmin+w,ymin+h]) 78 | label.append(int(f[j][5])) 79 | 80 | ''' 81 | # **************************** AUGMENTATION ************************************ 82 | # Copy and paste small objects at random locations in 83 | # image to increase the number of samples with small sizes. 84 | box_new = box.copy() 85 | label_new = label.copy() 86 | img_new = img.copy() 87 | for n in range(len(box)): 88 | j = box[n] 89 | if j[2]*j[3]<500: 90 | crop = img[int(j[1]):int(j[1]+j[3]),int(j[0]):int(j[0]+j[2])] 91 | x = random.randrange(0, img.shape[1],1) 92 | y = random.randrange(0, img.shape[0],1) 93 | 94 | try: 95 | img_new[int(y):int(y+j[3]),int(x):int(x+j[2])] = crop 96 | box_new.append([x,y,j[2],j[3]]) 97 | label_new.append(label[n]) 98 | except: 99 | continue 100 | 101 | # ******************************************************************************** 102 | 103 | self.boxes.append(torch.Tensor(box_new)) 104 | self.labels.append(torch.LongTensor(label_new)) 105 | img = img_new 106 | ''' 107 | 108 | self.boxes.append(torch.Tensor(box)) 109 | self.labels.append(torch.LongTensor(label)) 110 | 111 | boxes = self.boxes[-1].clone() 112 | labels = self.labels[-1] 113 | 114 | # Data augmentation while training. 115 | if self.train: 116 | img, boxes = self.random_flip(img, boxes) 117 | img, boxes, labels = self.random_crop(img, boxes, labels) 118 | 119 | # Scale bbox locaitons to [0,1]. 
120 | w,h = img.shape[1], img.shape[0] 121 | boxes /= torch.Tensor([w,h,w,h]).expand_as(boxes) 122 | img = cv2.resize(img, (self.img_size,self.img_size)) 123 | img = self.transform(img) 124 | 125 | # Encode loc & conf targets. 126 | 127 | loc_target, conf_target = self.data_encoder.encode(boxes, labels) 128 | return img, loc_target, conf_target 129 | 130 | def random_flip(self, img, boxes): 131 | '''Randomly flip the image and adjust the bbox locations. 132 | For bbox (xmin, ymin, xmax, ymax), the flipped bbox is: 133 | (w-xmax, ymin, w-xmin, ymax). 134 | Args: 135 | img: (ndarray.Image) image. f 136 | boxes: (tensor) bbox locations, sized [#obj, 4]. 137 | Returns: 138 | img: (ndarray.Image) randomly flipped image. 139 | boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4]. 140 | ''' 141 | if random.random() < 0.5: 142 | img = cv2.flip(img, 1) 143 | w = img.shape[1] 144 | xmin = w - boxes[:,2] 145 | xmax = w - boxes[:,0] 146 | boxes[:,0] = xmin 147 | boxes[:,2] = xmax 148 | return img, boxes 149 | 150 | def random_crop(self, img, boxes, labels): 151 | '''Randomly crop the image and adjust the bbox locations. 152 | For more details, see 'Chapter2.2: Data augmentation' of the paper. 153 | Args: 154 | img: (ndarray.Image) image. 155 | boxes: (tensor) bbox locations, sized [#obj, 4]. 156 | labels: (tensor) bbox labels, sized [#obj,]. 157 | Returns: 158 | img: (ndarray.Image) cropped image. 159 | selected_boxes: (tensor) selected bbox locations. 160 | labels: (tensor) selected bbox labels. 161 | ''' 162 | imw, imh = img.shape[1], img.shape[0] 163 | while True: 164 | min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])# random choice the one 165 | if min_iou is None: 166 | return img, boxes, labels 167 | 168 | for _ in range(100): 169 | w = random.randrange(int(0.1*imw), imw) 170 | h = random.randrange(int(0.1*imh), imh) 171 | 172 | if h > 2*w or w > 2*h or h < 1 or w < 1: 173 | continue 174 | 175 | x = random.randrange(imw - w) 176 | y = random.randrange(imh - h) 177 | roi = torch.Tensor([[x, y, x+w, y+h]]) 178 | 179 | center = (boxes[:,:2] + boxes[:,2:]) / 2 # [N,2] 180 | roi2 = roi.expand(len(center), 4) # [N,4] 181 | 182 | mask = (center > roi2[:,:2]) & (center < roi2[:,2:]) # [N,2] 183 | mask = mask[:,0] & mask[:,1] #[N,] 184 | 185 | if not mask.any(): 186 | continue 187 | 188 | selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1)) 189 | 190 | iou = self.data_encoder.iou(selected_boxes, roi) 191 | if iou.min() < min_iou: 192 | continue 193 | img = img[y:y+h, x:x+w, :] 194 | 195 | selected_boxes[:,0].add_(-x).clamp_(min=0, max=w) 196 | selected_boxes[:,1].add_(-y).clamp_(min=0, max=h) 197 | selected_boxes[:,2].add_(-x).clamp_(min=0, max=w) 198 | selected_boxes[:,3].add_(-y).clamp_(min=0, max=h) 199 | 200 | return img, selected_boxes, labels[mask] 201 | 202 | def __len__(self): 203 | return self.num_samples -------------------------------------------------------------------------------- /CFENet/scripts/encoder.py: -------------------------------------------------------------------------------- 1 | '''Encode target locations and labels.''' 2 | import torch 3 | 4 | import math 5 | import itertools 6 | 7 | class DataEncoder: 8 | def __init__(self): 9 | '''Compute default box sizes with scale and aspect transform.''' 10 | scale = 300. 
11 | steps = [s / scale for s in (32, 64, 100, 300, 8, 16)] 12 | sizes = [s / scale for s in (111, 162, 213, 264, 315, 30, 60)] 13 | aspect_ratios = ((2,3), (2,3), (2,), (2,), (2,), (2,3)) 14 | feature_map_sizes = (10, 5, 3, 1, 38, 19) 15 | num_layers = len(feature_map_sizes) 16 | 17 | boxes = [] 18 | for i in range(num_layers): 19 | fmsize = feature_map_sizes[i] # feature map size 20 | for h,w in itertools.product(range(fmsize), repeat=2): 21 | cx = (w + 0.5)*steps[i] 22 | cy = (h + 0.5)*steps[i] 23 | 24 | s = sizes[i] 25 | boxes.append((cx, cy, s, s)) 26 | 27 | s = math.sqrt(sizes[i] * sizes[i+1]) 28 | boxes.append((cx, cy, s, s)) 29 | 30 | s = sizes[i] 31 | for ar in aspect_ratios[i]: 32 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 33 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 34 | 35 | self.default_boxes = torch.Tensor(boxes) 36 | 37 | def iou(self, box1, box2): 38 | '''Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2]. 39 | 40 | Args: 41 | box1: (tensor) bounding boxes, sized [N,4]. 42 | box2: (tensor) bounding boxes, sized [M,4]. 43 | 44 | Return: 45 | (tensor) iou, sized [N,M]. 46 | ''' 47 | N = box1.size(0) 48 | M = box2.size(0) 49 | 50 | lt = torch.max( 51 | box1[:,:2].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 52 | box2[:,:2].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 53 | ) 54 | 55 | rb = torch.min( 56 | box1[:,2:].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 57 | box2[:,2:].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 58 | ) 59 | 60 | wh = rb - lt # [N,M,2] 61 | wh[wh<0] = 0 # clip at 0 62 | inter = wh[:,:,0] * wh[:,:,1] # [N,M] 63 | 64 | area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1]) # [N,] 65 | area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1]) # [M,] 66 | area1 = area1.unsqueeze(1).expand_as(inter) # [N,] -> [N,1] -> [N,M] 67 | area2 = area2.unsqueeze(0).expand_as(inter) # [M,] -> [1,M] -> [N,M] 68 | 69 | iou = inter / (area1 + area2 - inter) 70 | return iou 71 | 72 | def encode(self, boxes, classes, threshold=0.5): 73 | '''Transform target bounding boxes and class labels to SSD boxes and classes. 74 | 75 | Match each object box to all the default boxes, pick the ones with the 76 | Jaccard-Index > 0.5: 77 | Jaccard(A,B) = AB / (A+B-AB) 78 | 79 | Args: 80 | boxes: (tensor) object bounding boxes (xmin,ymin,xmax,ymax) of a image, sized [#obj, 4]. 81 | classes: (tensor) object class labels of a image, sized [#obj,]. 82 | threshold: (float) Jaccard index threshold 83 | 84 | Returns: 85 | boxes: (tensor) bounding boxes, sized [#obj, 8732, 4]. 
86 | classes: (tensor) class labels, sized [8732,] 87 | ''' 88 | default_boxes = self.default_boxes 89 | num_default_boxes = default_boxes.size(0) 90 | num_objs = boxes.size(0) 91 | 92 | iou = self.iou( # [#obj,8732] 93 | boxes, 94 | torch.cat([default_boxes[:,:2] - default_boxes[:,2:]/2, 95 | default_boxes[:,:2] + default_boxes[:,2:]/2], 1) 96 | ) 97 | 98 | iou, max_idx = iou.max(0) # [1,8732] 99 | max_idx.squeeze_(0) # [8732,] 100 | iou.squeeze_(0) # [8732,] 101 | 102 | boxes = boxes[max_idx] # [8732,4] 103 | variances = [0.1, 0.2] 104 | cxcy = (boxes[:,:2] + boxes[:,2:])/2 - default_boxes[:,:2] # [8732,2] 105 | cxcy /= variances[0] * default_boxes[:,2:] 106 | wh = (boxes[:,2:] - boxes[:,:2]) / default_boxes[:,2:] # [8732,2] 107 | wh = torch.log(wh) / variances[1] 108 | loc = torch.cat([cxcy, wh], 1) # [8732,4] 109 | 110 | conf = 1 + classes[max_idx] # [8732,], background class = 0 111 | conf[iou 0: 139 | while order.size > 0: 140 | try: 141 | i = order[0] 142 | except: 143 | i = order 144 | keep.append(i) 145 | 146 | # if order.numel() == 1: 147 | if order.size == 1: 148 | break 149 | 150 | xx1 = x1[order[1:]].clamp(min=x1[i]) 151 | yy1 = y1[order[1:]].clamp(min=y1[i]) 152 | xx2 = x2[order[1:]].clamp(max=x2[i]) 153 | yy2 = y2[order[1:]].clamp(max=y2[i]) 154 | 155 | w = (xx2-xx1).clamp(min=0) 156 | h = (yy2-yy1).clamp(min=0) 157 | inter = w*h 158 | 159 | if mode == 'union': 160 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 161 | elif mode == 'min': 162 | ovr = inter / areas[order[1:]].clamp(max=areas[i]) 163 | else: 164 | raise TypeError('Unknown nms mode: %s.' % mode) 165 | 166 | ids = (ovr<=threshold).nonzero().squeeze() 167 | # if ids.numel() == 0: 168 | if order.size == 0: 169 | break 170 | order = order[ids+1] 171 | # except: 172 | # print(order) 173 | # break 174 | return torch.LongTensor(keep) 175 | 176 | def decode(self, loc, conf): 177 | '''Transform predicted loc/conf back to real bbox locations and class labels. 178 | 179 | Args: 180 | loc: (tensor) predicted loc, sized [8732,4]. 181 | conf: (tensor) predicted conf, sized [8732,21]. 182 | 183 | Returns: 184 | boxes: (tensor) bbox locations, sized [#obj, 4]. 185 | labels: (tensor) class labels, sized [#obj,1]. 
186 | ''' 187 | variances = (0.1, 0.2) 188 | # print(loc.size(), conf.size()) 189 | wh = torch.exp(loc[:,2:]*variances[1]) * self.default_boxes[:,2:] 190 | cxcy = loc[:,:2] * variances[0] * self.default_boxes[:,2:] + self.default_boxes[:,:2] 191 | box_preds = torch.cat([cxcy-wh/2, cxcy+wh/2], 1) # [8732,4] 192 | 193 | boxes = [] 194 | labels = [] 195 | scores = [] 196 | num_classes = conf.size(1) 197 | for i in range(num_classes-1): 198 | score = conf[:,i+1] # class i corresponds to (i+1) column 199 | mask = score > 0.1 200 | 201 | if not mask.any(): 202 | continue 203 | 204 | box = box_preds[mask.nonzero().squeeze()] 205 | score = score[mask] 206 | 207 | if len(score) == 1: 208 | continue 209 | 210 | keep = self.nms(box, score, threshold=0.3) 211 | boxes.append(box[keep]) 212 | labels.append(torch.LongTensor(len(box[keep])).fill_(i)) 213 | scores.append(score[keep]) 214 | 215 | return boxes, labels, scores 216 | -------------------------------------------------------------------------------- /CFENet/scripts/multibox_layer.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLayer(nn.Module): 12 | num_classes = 13 13 | num_anchors = [6,6,4,4,4,6] 14 | in_planes = [512,256,256,256,512,512] 15 | 16 | def __init__(self): 17 | super(MultiBoxLayer, self).__init__() 18 | 19 | self.loc_layers = nn.ModuleList() 20 | self.conf_layers = nn.ModuleList() 21 | for i in range(len(self.in_planes)): 22 | self.loc_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*4, kernel_size=3, padding=1)) 23 | self.conf_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*13, kernel_size=3, padding=1)) 24 | 25 | def forward(self, xs): 26 | ''' 27 | Args: 28 | xs: (list) of tensor containing intermediate layer outputs. 29 | 30 | Returns: 31 | loc_preds: (tensor) predicted locations, sized [N,8732,4]. 32 | conf_preds: (tensor) predicted class confidences, sized [N,8732,21]. 33 | ''' 34 | y_locs = [] 35 | y_confs = [] 36 | for i, x in enumerate(xs): 37 | y_loc = self.loc_layers[i](x) 38 | N = y_loc.size(0) 39 | y_loc = y_loc.permute(0,2,3,1).contiguous() 40 | y_loc = y_loc.view(N,-1,4) 41 | y_locs.append(y_loc) 42 | 43 | y_conf = self.conf_layers[i](x) 44 | y_conf = y_conf.permute(0,2,3,1).contiguous() 45 | y_conf = y_conf.view(N,-1,13) 46 | y_confs.append(y_conf) 47 | 48 | loc_preds = torch.cat(y_locs, 1) 49 | conf_preds = torch.cat(y_confs, 1) 50 | return loc_preds, conf_preds 51 | -------------------------------------------------------------------------------- /CFENet/scripts/multibox_loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLoss(nn.Module): 12 | num_classes = 13 13 | 14 | def __init__(self): 15 | super(MultiBoxLoss, self).__init__() 16 | 17 | def cross_entropy_loss(self, x, y): 18 | '''Cross entropy loss w/o averaging across all samples. 19 | 20 | Args: 21 | x: (tensor) sized [N,D]. 22 | y: (tensor) sized [N,]. 23 | 24 | Return: 25 | (tensor) cross entroy loss, sized [N,]. 
26 | ''' 27 | xmax = x.data.max() 28 | print('x y size {} {}'.format(x.size(), y.size())) 29 | log_sum_exp = torch.log(torch.sum(torch.exp(x-xmax), 1)) + xmax 30 | print('log_sum_exp {}'.format(log_sum_exp.size())) 31 | return log_sum_exp - x.gather(1, y.view(-1,1)) 32 | 33 | def hard_negative_mining(self, conf_loss, pos): 34 | '''Return negative indices that is 3x the number as postive indices. 35 | 36 | Args: 37 | conf_loss: (tensor) cross entroy loss between conf_preds and conf_targets, sized [N*8732,]. 38 | pos: (tensor) positive(matched) box indices, sized [N,8732]. 39 | 40 | Return: 41 | (tensor) negative indices, sized [N,8732]. 42 | ''' 43 | batch_size, num_boxes = pos.size() 44 | # print(pos.size()) 45 | conf_loss[pos.view(-1)] = 0 # set pos boxes = 0, the rest are neg conf_loss 46 | conf_loss = conf_loss.view(batch_size, -1) # [N,8732] 47 | 48 | _,idx = conf_loss.sort(1, descending=True) # sort by neg conf_loss 49 | _,rank = idx.sort(1) # [N,8732] 50 | 51 | num_pos = pos.long().sum(1) # [N,1] 52 | num_neg = torch.clamp(3*num_pos, max=num_boxes-1) # [N,1] 53 | 54 | neg = rank < num_neg.unsqueeze(1).expand_as(rank) 55 | 56 | return neg 57 | 58 | def forward(self, loc_preds, loc_targets, conf_preds, conf_targets): 59 | '''Compute loss between (loc_preds, loc_targets) and (conf_preds, conf_targets). 60 | 61 | Args: 62 | loc_preds: (tensor) predicted locations, sized [batch_size, 8732, 4]. 63 | loc_targets: (tensor) encoded target locations, sized [batch_size, 8732, 4]. 64 | conf_preds: (tensor) predicted class confidences, sized [batch_size, 8732, num_classes]. 65 | conf_targets: (tensor) encoded target classes, sized [batch_size, 8732]. 66 | 67 | loss: 68 | (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + CrossEntropyLoss(conf_preds, conf_targets). 69 | ''' 70 | batch_size, num_boxes, _ = loc_preds.size() 71 | pos = conf_targets > 0 # [N,8732], pos means the box matched. 
72 | num_matched_boxes = pos.data.float().sum() 73 | if num_matched_boxes == 0: 74 | return torch.tensor([0.], requires_grad=True) 75 | 76 | ################################################################ 77 | # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets) 78 | ################################################################ 79 | pos_mask = pos.unsqueeze(2).expand_as(loc_preds) # [N,8732,4] 80 | pos_loc_preds = loc_preds[pos_mask].view(-1,4) # [#pos,4] 81 | pos_loc_targets = loc_targets[pos_mask].view(-1,4) # [#pos,4] 82 | 83 | loc_loss = F.smooth_l1_loss(pos_loc_preds, pos_loc_targets, size_average=False) 84 | 85 | ################################################################ 86 | # conf_loss = CrossEntropyLoss(pos_conf_preds, pos_conf_targets) 87 | # + CrossEntropyLoss(neg_conf_preds, neg_conf_targets) 88 | ################################################################ 89 | # print(conf_targets.view(-1).size()) 90 | conf_loss = F.cross_entropy(conf_preds.view(-1,self.num_classes), \ 91 | conf_targets.view(-1), reduce=False) # [N*8732,] 92 | neg = self.hard_negative_mining(conf_loss, pos) # [N,8732] 93 | 94 | pos_mask = pos.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 95 | neg_mask = neg.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 96 | mask = (pos_mask+neg_mask).gt(0) 97 | 98 | pos_and_neg = (pos+neg).gt(0) 99 | preds = conf_preds[mask].view(-1,self.num_classes) # [#pos+#neg,21] 100 | targets = conf_targets[pos_and_neg] # [#pos+#neg,] 101 | conf_loss = F.cross_entropy(preds, targets, size_average=False) 102 | 103 | loc_loss /= num_matched_boxes 104 | conf_loss /= num_matched_boxes 105 | 106 | return loc_loss + conf_loss 107 | -------------------------------------------------------------------------------- /CFENet/scripts/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import argparse 5 | import itertools 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | import torch.backends.cudnn as cudnn 12 | 13 | import torchvision 14 | import torchvision.transforms as transforms 15 | 16 | import numpy as np 17 | from cfenet import CFENet 18 | from datagen import ListDataset 19 | from multibox_loss import MultiBoxLoss 20 | 21 | lr = 0.001 22 | resume = False # Resume from checkpoint 23 | epoch = 200 24 | 25 | use_cuda = torch.cuda.is_available() 26 | best_loss = float('inf') # best test loss 27 | start_epoch = 0 # start from epoch 0 or last epoch 28 | 29 | # Data 30 | print('==> Preparing data..') 31 | transform = transforms.Compose([transforms.ToTensor(), 32 | transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]) 33 | 34 | trainset = ListDataset(root='../../../../VisDrone2019/dataset/VisDrone2018-DET-train/images/', list_file='../../../../VisDrone2019/dataset/VisDrone2018-DET-train/annotations/', train=True, transform=transform) 35 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True, num_workers=4) 36 | 37 | valset = ListDataset(root='../../../../VisDrone2019/dataset/VisDrone2019-DET-val/images/', list_file='../../../../VisDrone2019/dataset/VisDrone2019-DET-val/annotations/', train=True, transform=transform) 38 | valloader = torch.utils.data.DataLoader(valset, batch_size=8, shuffle=True, num_workers=4) 39 | 40 | 41 | # Model 42 | net = CFENet() 43 | if resume: 44 | print('==> Resuming from checkpoint..') 45 | checkpoint = 
torch.load('./checkpoint/ckpt.pth') 46 | 47 | keys = [] 48 | for k,v in checkpoint['net'].items(): 49 | if "module" in k: 50 | keys.append(k) 51 | for i in keys: 52 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 53 | del checkpoint['net'][i] 54 | 55 | net.load_state_dict(checkpoint['net']) 56 | best_loss = checkpoint['loss'] 57 | start_epoch = checkpoint['epoch'] 58 | else: 59 | # Convert from pretrained VGG model. 60 | try: 61 | net.load_state_dict(torch.load('../model/ssd.pth')) 62 | print('==> Pretrain model read successfully') 63 | except: 64 | print('==> Pretrain model read failed or not existed, training from init') 65 | 66 | 67 | criterion = MultiBoxLoss() 68 | 69 | if use_cuda: 70 | net = torch.nn.DataParallel(net, device_ids=[0]) 71 | net.cuda() 72 | cudnn.benchmark = True 73 | 74 | optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4) 75 | 76 | # Training 77 | def train(epoch, prev_val_loss, last_saved): 78 | print('\nEpoch: %d' % epoch) 79 | net.train() 80 | train_loss = 0 81 | 82 | for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader): 83 | if use_cuda: 84 | images = images.cuda() 85 | loc_targets = loc_targets.cuda() 86 | conf_targets = conf_targets.cuda() 87 | 88 | images = torch.tensor(images) 89 | loc_targets = torch.tensor(loc_targets) 90 | conf_targets = torch.tensor(conf_targets) 91 | 92 | optimizer.zero_grad() 93 | loc_preds, conf_preds = net(images) 94 | loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 95 | loss.backward() 96 | optimizer.step() 97 | 98 | train_loss += loss.item() 99 | if batch_idx%100 == 0: 100 | val_loss_tot = 0 101 | for batch_idx_val, (images, loc_targets, conf_targets) in enumerate(valloader): 102 | if use_cuda: 103 | images = images.cuda() 104 | loc_targets = loc_targets.cuda() 105 | conf_targets = conf_targets.cuda() 106 | 107 | images = torch.tensor(images) 108 | loc_targets = torch.tensor(loc_targets) 109 | conf_targets = torch.tensor(conf_targets) 110 | 111 | loc_preds, conf_preds = net(images) 112 | val_loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 113 | val_loss_tot += val_loss.item() 114 | 115 | val_loss_tot /= (batch_idx_val+1) 116 | if val_loss_tot < prev_val_loss: 117 | os.makedirs('checkpoint', exist_ok=True) 118 | torch.save({ 119 | 'epoch': epoch, 120 | 'net': net.state_dict(), 121 | 'loss': loss, 122 | }, 'checkpoint/ckpt.pth') 123 | print("Saved.") 124 | prev_val_loss = val_loss_tot 125 | last_saved = [epoch, batch_idx] 126 | print('epoch: {}, batch_idx: {},loss: {}, train_loss: {}, best_val_loss: {}, last_saved: {}'.format(epoch, batch_idx, loss.item(), train_loss/(batch_idx+1), prev_val_loss, last_saved)) 127 | 128 | return prev_val_loss, last_saved 129 | 130 | prev_val_loss = 999 131 | last_saved = [start_epoch,0] 132 | for epoch_num in range(start_epoch, start_epoch+epoch): 133 | prev_val_loss, last_saved = train(epoch_num, prev_val_loss, last_saved) 134 | -------------------------------------------------------------------------------- /CFENet/scripts/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | from torch.autograd import Variable 7 | 8 | class CFE(nn.Module): 9 | def __init__(self, channels, k): 10 | super(CFE, self).__init__() 11 | 12 | # First branch 13 | self.conv1_1 = nn.Conv2d(channels, channels//2, kernel_size=1, padding=0) 14 | self.bn1 = nn.BatchNorm2d(channels//2) 15 
| self.conv1_2 = nn.Conv2d(channels//2,channels//2, kernel_size=(k,1), padding=(k//2,0),groups=8) 16 | self.conv1_3 = nn.Conv2d(channels//2, channels//2, kernel_size=(1,k), padding=(0,k//2),groups=8) 17 | self.bn2 = nn.BatchNorm2d(channels//2) 18 | self.conv1_4 = nn.Conv2d(channels//2, channels//2, kernel_size=1, padding=0) 19 | self.bn3 = nn.BatchNorm2d(channels//2) 20 | 21 | # Second branch 22 | self.conv2_1 = nn.Conv2d(channels, channels//2, kernel_size=1, padding=0) 23 | self.bn4 = nn.BatchNorm2d(channels//2) 24 | self.conv2_2 = nn.Conv2d(channels//2,channels//2, kernel_size=(1,k), padding=(0,k//2),groups=8) 25 | self.conv2_3 = nn.Conv2d(channels//2, channels//2, kernel_size=(k,1), padding=(k//2,0),groups=8) 26 | self.bn5 = nn.BatchNorm2d(channels//2) 27 | self.conv2_4 = nn.Conv2d(channels//2, channels//2, kernel_size=1, padding=0) 28 | self.bn6 = nn.BatchNorm2d(channels//2) 29 | 30 | def forward(self, x): 31 | 32 | # First branch 33 | f = self.bn1(F.relu(self.conv1_1(x))) 34 | # print(f.size()) 35 | f = self.conv1_2(f) 36 | # print(f.size()) 37 | f = self.bn2(F.relu(self.conv1_3(f))) 38 | # print(f.size()) 39 | f = self.bn3(F.relu(self.conv1_4(f))) 40 | # print(f.size()) 41 | 42 | # Second branch 43 | s = self.bn4(F.relu(self.conv2_1(x))) 44 | # print(s.size()) 45 | s = self.conv2_2(s) 46 | # print(s.size()) 47 | s = self.bn5(F.relu(self.conv2_3(s))) 48 | # print(s.size()) 49 | s = self.bn6(F.relu(self.conv2_4(s))) 50 | # print(s.size()) 51 | 52 | fs = torch.cat((f,s), 1) 53 | # print(fs.size()) 54 | 55 | return (fs+x) 56 | 57 | class FFB(nn.Module): 58 | def __init__(self, c1, c2): 59 | super(FFB, self).__init__() 60 | 61 | self.conv1 = nn.Conv2d(c1, c1, kernel_size=1, padding=0) 62 | self.bn1 = nn.BatchNorm2d(c1) 63 | self.conv2 = nn.Conv2d(c2, c1, kernel_size=1, padding=0) 64 | self.bn2 = nn.BatchNorm2d(c1) 65 | self.deconv1 = nn.ConvTranspose2d(c1,c1, kernel_size=3, stride=2, padding=(1,1)) 66 | 67 | def forward(self, x1, x2): 68 | 69 | f = self.bn1(F.relu(self.conv1(x1))) 70 | # print(f.size()) 71 | s = self.bn2(F.relu(self.conv2(x2))) 72 | # print(s.size()) 73 | # s = F.upsample(s, scale_factor = 2) 74 | s = self.deconv1(s) 75 | # print(s.size()) 76 | 77 | # return(f+s) 78 | return f 79 | 80 | if __name__ == "__main__": 81 | x1 = torch.rand(1,512,38,38) 82 | x2 = torch.rand(1,1024,19,19) 83 | 84 | # model = CFE(512,3) 85 | model = FFB(512,1024) 86 | # x = model(x1) 87 | x = model(x1,x2) 88 | print(x.size()) 89 | 90 | 91 | x1 = torch.rand(1,512,19,19) 92 | x2 = torch.rand(1,1024,10,10) 93 | 94 | model = FFB(512,1024) 95 | x = model(x1,x2) 96 | print(x.size()) 97 | -------------------------------------------------------------------------------- /MDSSD_300/scripts/datagen.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load image/class/box from a annotation file. 
3 | 4 | ''' 5 | from __future__ import print_function 6 | 7 | import os 8 | import sys 9 | import os.path 10 | 11 | import random 12 | import numpy as np 13 | 14 | import torch 15 | import torch.utils.data as data 16 | import torchvision.transforms as transforms 17 | 18 | from encoder import DataEncoder 19 | import cv2 20 | 21 | import pandas as pd 22 | import shutil 23 | import os 24 | import numpy as np 25 | import glob 26 | import xml.etree.ElementTree as ET 27 | from xml.dom import minidom 28 | 29 | class ListDataset(data.Dataset): 30 | img_size = 300 31 | 32 | def __init__(self, root, list_file, train, transform): 33 | ''' 34 | Args: 35 | root: (str) ditectory to images. 36 | list_file: (str) path to annotation files. 37 | train: (boolean) train or test. 38 | transform: ([transforms]) image transforms. 39 | ''' 40 | self.root = root 41 | self.train = train 42 | self.transform = transform 43 | 44 | self.fnames = [] 45 | self.boxes = [] 46 | self.labels = [] 47 | 48 | self.data_encoder = DataEncoder() 49 | self.num_samples = 0 50 | 51 | # VisDrone 52 | 53 | for i in os.listdir(list_file): 54 | self.num_samples += 1 55 | self.fnames.append(i) 56 | box = [] 57 | labels = [] 58 | with open(os.path.join(list_file,i)) as f: 59 | f = f.read().split("\n") 60 | f = f[:-1] 61 | num_objs = len(f) 62 | 63 | for j in range(num_objs): 64 | f[j] = f[j].split(",") 65 | xmin = float(f[j][0]) 66 | ymin = float(f[j][1]) 67 | w = float(f[j][2]) 68 | h = float(f[j][3]) 69 | 70 | box.append([xmin,ymin,xmin+h,ymin+h]) 71 | labels.append(int(f[j][5])) 72 | 73 | self.boxes.append(torch.Tensor(box)) 74 | self.labels.append(torch.LongTensor(labels)) 75 | 76 | 77 | def __getitem__(self, idx): 78 | '''Load a image, and encode its bbox locations and class labels. 79 | Args: 80 | idx: (int) image index. 81 | Returns: 82 | img: (tensor) image tensor. 83 | loc_target: (tensor) location targets, sized [8732,4]. 84 | conf_target: (tensor) label targets, sized [8732,]. 85 | ''' 86 | # Load image and bbox locations. 87 | fname = self.fnames[idx] 88 | img = cv2.imread(os.path.join(self.root, fname[:-4]+".jpg")) 89 | boxes = self.boxes[idx].clone() 90 | labels = self.labels[idx] 91 | 92 | # Data augmentation while training. 93 | if self.train: 94 | img, boxes = self.random_flip(img, boxes) 95 | img, boxes, labels = self.random_crop(img, boxes, labels) 96 | 97 | # Scale bbox locaitons to [0,1]. 98 | w,h = img.shape[1], img.shape[0] 99 | boxes /= torch.Tensor([w,h,w,h]).expand_as(boxes) 100 | img = cv2.resize(img, (self.img_size,self.img_size)) 101 | img = self.transform(img) 102 | 103 | # Encode loc & conf targets. 104 | 105 | loc_target, conf_target = self.data_encoder.encode(boxes, labels) 106 | return img, loc_target, conf_target 107 | 108 | def random_flip(self, img, boxes): 109 | '''Randomly flip the image and adjust the bbox locations. 110 | For bbox (xmin, ymin, xmax, ymax), the flipped bbox is: 111 | (w-xmax, ymin, w-xmin, ymax). 112 | Args: 113 | img: (ndarray.Image) image. f 114 | boxes: (tensor) bbox locations, sized [#obj, 4]. 115 | Returns: 116 | img: (ndarray.Image) randomly flipped image. 117 | boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4]. 118 | ''' 119 | if random.random() < 0.5: 120 | img = cv2.flip(img, 1) 121 | w = img.shape[1] 122 | xmin = w - boxes[:,2] 123 | xmax = w - boxes[:,0] 124 | boxes[:,0] = xmin 125 | boxes[:,2] = xmax 126 | return img, boxes 127 | 128 | def random_crop(self, img, boxes, labels): 129 | '''Randomly crop the image and adjust the bbox locations. 
130 | For more details, see 'Chapter2.2: Data augmentation' of the paper. 131 | Args: 132 | img: (ndarray.Image) image. 133 | boxes: (tensor) bbox locations, sized [#obj, 4]. 134 | labels: (tensor) bbox labels, sized [#obj,]. 135 | Returns: 136 | img: (ndarray.Image) cropped image. 137 | selected_boxes: (tensor) selected bbox locations. 138 | labels: (tensor) selected bbox labels. 139 | ''' 140 | imw, imh = img.shape[1], img.shape[0] 141 | while True: 142 | min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])# random choice the one 143 | if min_iou is None: 144 | return img, boxes, labels 145 | 146 | for _ in range(100): 147 | w = random.randrange(int(0.1*imw), imw) 148 | h = random.randrange(int(0.1*imh), imh) 149 | 150 | if h > 2*w or w > 2*h or h < 1 or w < 1: 151 | continue 152 | 153 | x = random.randrange(imw - w) 154 | y = random.randrange(imh - h) 155 | roi = torch.Tensor([[x, y, x+w, y+h]]) 156 | 157 | center = (boxes[:,:2] + boxes[:,2:]) / 2 # [N,2] 158 | roi2 = roi.expand(len(center), 4) # [N,4] 159 | 160 | mask = (center > roi2[:,:2]) & (center < roi2[:,2:]) # [N,2] 161 | mask = mask[:,0] & mask[:,1] #[N,] 162 | 163 | if not mask.any(): 164 | continue 165 | 166 | selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1)) 167 | 168 | iou = self.data_encoder.iou(selected_boxes, roi) 169 | if iou.min() < min_iou: 170 | continue 171 | img = img[y:y+h, x:x+w, :] 172 | 173 | selected_boxes[:,0].add_(-x).clamp_(min=0, max=w) 174 | selected_boxes[:,1].add_(-y).clamp_(min=0, max=h) 175 | selected_boxes[:,2].add_(-x).clamp_(min=0, max=w) 176 | selected_boxes[:,3].add_(-y).clamp_(min=0, max=h) 177 | 178 | return img, selected_boxes, labels[mask] 179 | 180 | def __len__(self): 181 | return self.num_samples -------------------------------------------------------------------------------- /MDSSD_300/scripts/encoder.py: -------------------------------------------------------------------------------- 1 | '''Encode target locations and labels.''' 2 | import torch 3 | 4 | import math 5 | import itertools 6 | 7 | class DataEncoder: 8 | def __init__(self): 9 | '''Compute default box sizes with scale and aspect transform.''' 10 | scale = 300. 
11 | steps = [s / scale for s in (32, 64, 100, 300, 4, 8, 16)] 12 | sizes_ssd = [s / scale for s in (111, 162, 213, 264, 315)] 13 | sizes_fusion = [s / scale for s in (15, 30, 60, 111)] 14 | aspect_ratios = ((2,3), (2,3), (2,), (2,), (2,), (2,), (2,)) 15 | feature_map_sizes = (10, 5, 3, 1, 75, 38, 19) 16 | num_layers = len(feature_map_sizes) 17 | 18 | boxes = [] 19 | for i in range(num_layers): 20 | fmsize = feature_map_sizes[i] # feature map size 21 | for h,w in itertools.product(range(fmsize), repeat=2): 22 | cx = (w + 0.5)*steps[i] 23 | cy = (h + 0.5)*steps[i] 24 | 25 | if i<4: 26 | s = sizes_ssd[i] 27 | boxes.append((cx, cy, s, s)) 28 | 29 | s = math.sqrt(sizes_ssd[i] * sizes_ssd[i+1]) 30 | boxes.append((cx, cy, s, s)) 31 | 32 | s = sizes_ssd[i] 33 | for ar in aspect_ratios[i]: 34 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 35 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 36 | 37 | else: 38 | s = sizes_fusion[i-4] 39 | boxes.append((cx, cy, s, s)) 40 | 41 | s = math.sqrt(sizes_fusion[i-4] * sizes_fusion[i-4+1]) 42 | boxes.append((cx, cy, s, s)) 43 | 44 | s = sizes_fusion[i-4] 45 | for ar in aspect_ratios[i]: 46 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 47 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 48 | 49 | self.default_boxes = torch.Tensor(boxes) 50 | 51 | def iou(self, box1, box2): 52 | '''Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2]. 53 | 54 | Args: 55 | box1: (tensor) bounding boxes, sized [N,4]. 56 | box2: (tensor) bounding boxes, sized [M,4]. 57 | 58 | Return: 59 | (tensor) iou, sized [N,M]. 60 | ''' 61 | N = box1.size(0) 62 | M = box2.size(0) 63 | 64 | lt = torch.max( 65 | box1[:,:2].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 66 | box2[:,:2].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 67 | ) 68 | 69 | rb = torch.min( 70 | box1[:,2:].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 71 | box2[:,2:].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 72 | ) 73 | 74 | wh = rb - lt # [N,M,2] 75 | wh[wh<0] = 0 # clip at 0 76 | inter = wh[:,:,0] * wh[:,:,1] # [N,M] 77 | 78 | area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1]) # [N,] 79 | area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1]) # [M,] 80 | area1 = area1.unsqueeze(1).expand_as(inter) # [N,] -> [N,1] -> [N,M] 81 | area2 = area2.unsqueeze(0).expand_as(inter) # [M,] -> [1,M] -> [N,M] 82 | 83 | iou = inter / (area1 + area2 - inter) 84 | return iou 85 | 86 | def encode(self, boxes, classes, threshold=0.5): 87 | '''Transform target bounding boxes and class labels to SSD boxes and classes. 88 | 89 | Match each object box to all the default boxes, pick the ones with the 90 | Jaccard-Index > 0.5: 91 | Jaccard(A,B) = AB / (A+B-AB) 92 | 93 | Args: 94 | boxes: (tensor) object bounding boxes (xmin,ymin,xmax,ymax) of a image, sized [#obj, 4]. 95 | classes: (tensor) object class labels of a image, sized [#obj,]. 96 | threshold: (float) Jaccard index threshold 97 | 98 | Returns: 99 | boxes: (tensor) bounding boxes, sized [#obj, 8732, 4]. 
100 | classes: (tensor) class labels, sized [8732,] 101 | ''' 102 | default_boxes = self.default_boxes 103 | num_default_boxes = default_boxes.size(0) 104 | num_objs = boxes.size(0) 105 | 106 | iou = self.iou( # [#obj,8732] 107 | boxes, 108 | torch.cat([default_boxes[:,:2] - default_boxes[:,2:]/2, 109 | default_boxes[:,:2] + default_boxes[:,2:]/2], 1) 110 | ) 111 | 112 | iou, max_idx = iou.max(0) # [1,8732] 113 | max_idx.squeeze_(0) # [8732,] 114 | iou.squeeze_(0) # [8732,] 115 | 116 | boxes = boxes[max_idx] # [8732,4] 117 | variances = [0.1, 0.2] 118 | cxcy = (boxes[:,:2] + boxes[:,2:])/2 - default_boxes[:,:2] # [8732,2] 119 | cxcy /= variances[0] * default_boxes[:,2:] 120 | wh = (boxes[:,2:] - boxes[:,:2]) / default_boxes[:,2:] # [8732,2] 121 | wh = torch.log(wh) / variances[1] 122 | loc = torch.cat([cxcy, wh], 1) # [8732,4] 123 | 124 | conf = 1 + classes[max_idx] # [8732,], background class = 0 125 | conf[iou 0: 153 | try: 154 | i = order[0] 155 | except: 156 | i = order 157 | keep.append(i) 158 | 159 | if order.size == 1: 160 | break 161 | 162 | xx1 = x1[order[1:]].clamp(min=x1[i]) 163 | yy1 = y1[order[1:]].clamp(min=y1[i]) 164 | xx2 = x2[order[1:]].clamp(max=x2[i]) 165 | yy2 = y2[order[1:]].clamp(max=y2[i]) 166 | 167 | w = (xx2-xx1).clamp(min=0) 168 | h = (yy2-yy1).clamp(min=0) 169 | inter = w*h 170 | 171 | if mode == 'union': 172 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 173 | elif mode == 'min': 174 | ovr = inter / areas[order[1:]].clamp(max=areas[i]) 175 | else: 176 | raise TypeError('Unknown nms mode: %s.' % mode) 177 | 178 | ids = (ovr<=threshold).nonzero().squeeze() 179 | if ids.size == 0: 180 | break 181 | order = order[ids+1] 182 | return torch.LongTensor(keep) 183 | 184 | def decode(self, loc, conf): 185 | '''Transform predicted loc/conf back to real bbox locations and class labels. 186 | 187 | Args: 188 | loc: (tensor) predicted loc, sized [8732,4]. 189 | conf: (tensor) predicted conf, sized [8732,21]. 190 | 191 | Returns: 192 | boxes: (tensor) bbox locations, sized [#obj, 4]. 193 | labels: (tensor) class labels, sized [#obj,1]. 
194 | ''' 195 | variances = (0.1, 0.2) 196 | wh = torch.exp(loc[:,2:]*variances[1]) * self.default_boxes[:,2:] 197 | cxcy = loc[:,:2] * variances[0] * self.default_boxes[:,2:] + self.default_boxes[:,:2] 198 | box_preds = torch.cat([cxcy-wh/2, cxcy+wh/2], 1) # [8732,4] 199 | 200 | boxes = [] 201 | labels = [] 202 | scores = [] 203 | num_classes = conf.size(1) 204 | for i in range(num_classes-1): 205 | score = conf[:,i+1] # class i corresponds to (i+1) column 206 | mask = score > 0.1 207 | 208 | if not mask.any(): 209 | continue 210 | 211 | box = box_preds[mask.nonzero().squeeze()] 212 | score = score[mask] 213 | 214 | if len(score) == 1: 215 | continue 216 | keep = self.nms(box, score, threshold=0.3) 217 | boxes.append(box[keep]) 218 | labels.append(torch.LongTensor(len(box[keep])).fill_(i)) 219 | scores.append(score[keep]) 220 | 221 | return boxes, labels, scores 222 | -------------------------------------------------------------------------------- /MDSSD_300/scripts/fusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | from torch.autograd import Variable 7 | 8 | from norm import L2Norm 9 | 10 | class FusionBlock(nn.Module): 11 | def __init__(self, big_features, small_features): 12 | super(FusionBlock, self).__init__() 13 | 14 | # Bigger feature map 15 | self.conv1_1 = nn.Conv2d(big_features, 256, kernel_size=3, padding=1, dilation=1) 16 | self.Norm1 = L2Norm(256, 20) 17 | 18 | # Smaller feature map 19 | self.deconv2_1 = nn.ConvTranspose2d(small_features, 256, 2, stride=2, dilation=1) 20 | self.conv2_1 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 21 | self.deconv2_2 = nn.ConvTranspose2d(256, 256, 2, stride=2, dilation=1) 22 | self.conv2_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 23 | self.deconv2_3 = nn.ConvTranspose2d(256, 256, 3, stride=2, dilation=1) 24 | self.conv2_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 25 | self.Norm2 = L2Norm(256, 20) 26 | 27 | # Common 28 | self.conv3_1 = nn.Conv2d(256, big_features, kernel_size=3, padding=1, dilation=1) 29 | 30 | 31 | def forward(self, big, small): 32 | h1 = self.conv1_1(big) 33 | h1 = self.Norm1(h1) 34 | 35 | h2 = self.deconv2_1(small) 36 | # print(h2.size()) 37 | h2 = F.relu(self.conv2_1(h2)) 38 | # print(h2.size()) 39 | h2 = self.deconv2_2(h2) 40 | # print(h2.size()) 41 | h2 = F.relu(self.conv2_2(h2)) 42 | # print(h2.size()) 43 | h2 = self.deconv2_3(h2) 44 | # print(h2.size()) 45 | h2 = self.conv2_3(h2) 46 | # print(h2.size()) 47 | h2 = self.Norm2(h2) 48 | 49 | size = h2.size()[3] 50 | diff_odd = h2.size()[-1] - h1.size()[-1] 51 | h2 = h2[:,:,(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2)),(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2))] 52 | 53 | # print(h1.size(), h2.size()) 54 | h = F.relu(h1+h2) 55 | h = F.relu(self.conv3_1(h)) 56 | 57 | return h 58 | 59 | if __name__ == '__main__': 60 | big = torch.randn(1, 256, 128, 128) 61 | small = torch.rand(1,512,16,16) 62 | net = FusionBlock(256,512) 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /MDSSD_300/scripts/mdssd.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | 9 | from torch.autograd import Variable 10 | 11 | from multibox_layer 
import MultiBoxLayer 12 | from fusion import FusionBlock 13 | from norm import L2Norm 14 | 15 | class MDSSD300(nn.Module): 16 | input_size = 300 17 | 18 | def __init__(self): 19 | super(MDSSD300, self).__init__() 20 | 21 | # model 22 | self.base = self.VGG16() 23 | self.norm4 = L2Norm(512, 20) # 38 24 | 25 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 26 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 27 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 28 | 29 | self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 30 | 31 | self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 32 | 33 | self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1) 34 | self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2) 35 | 36 | self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1) 37 | self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2) 38 | 39 | self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1) 40 | self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3) 41 | 42 | self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1) 43 | self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3) 44 | 45 | self.Fusion1 = FusionBlock(256,512) 46 | self.Fusion2 = FusionBlock(512,256) 47 | self.Fusion3 = FusionBlock(1024,256) 48 | 49 | # multibox layer 50 | self.multibox = MultiBoxLayer() 51 | 52 | def forward(self, x): 53 | odd_count = 0 54 | odd = [] 55 | hs = [] 56 | vgg = [] 57 | fusion_layers = [] 58 | h = self.base[0](x) 59 | vgg.append(h) 60 | for i in range(1,len(self.base)): 61 | h = self.base[i](h) 62 | vgg.append(h) 63 | fusion_layers.append(vgg[15]) 64 | odd.append(2) 65 | odd_count = 3 66 | fusion_layers.append(h) 67 | h = F.max_pool2d(h, kernel_size=2, stride=2, ceil_mode=True) 68 | 69 | h = F.relu(self.conv5_1(h)) 70 | h = F.relu(self.conv5_2(h)) 71 | h = F.relu(self.conv5_3(h)) 72 | h = F.max_pool2d(h, kernel_size=3, padding=1, stride=1, ceil_mode=True) 73 | 74 | h = F.relu(self.conv6(h)) 75 | h = F.relu(self.conv7(h)) 76 | fusion_layers.append(h) 77 | 78 | h = F.relu(self.conv8_1(h)) 79 | h = F.relu(self.conv8_2(h)) 80 | hs.append(h) # conv8_2 81 | 82 | h = F.relu(self.conv9_1(h)) 83 | h = F.relu(self.conv9_2(h)) 84 | hs.append(h) # conv9_2 85 | 86 | h = F.relu(self.conv10_1(h)) 87 | h = F.relu(self.conv10_2(h)) 88 | hs.append(h) # conv10_2 89 | 90 | h = F.relu(self.conv11_1(h)) 91 | h = F.relu(self.conv11_2(h)) 92 | hs.append(h) # conv11_2 93 | 94 | # Fusion Blocks 95 | f = self.Fusion1(fusion_layers[0],hs[-4]) 96 | hs.append(f) 97 | f = self.Fusion2(fusion_layers[1],hs[-4]) 98 | hs.append(f) 99 | diff_odd = fusion_layers[2].size()[-1] - hs[-4].size()[-1] 100 | f = self.Fusion3(fusion_layers[2],hs[-4]) 101 | hs.append(f) 102 | 103 | loc_preds, conf_preds = self.multibox(hs) 104 | 105 | return loc_preds, conf_preds 106 | 107 | def VGG16(self): 108 | '''VGG16 layers.''' 109 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512] 110 | layers = [] 111 | in_channels = 3 112 | for x in cfg: 113 | if x == 'M': 114 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 115 | else: 116 | layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 117 | nn.ReLU(True)] 118 | in_channels = x 119 | return nn.Sequential(*layers) 120 | 121 | if __name__ == '__main__': 122 | t = torch.randn(1, 3, 300, 300) 123 | net = MDSSD300() 124 | # print(net) 125 | res = net.forward(t) 126 | -------------------------------------------------------------------------------- 
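Shape note for MDSSD_300 (added for clarity; the constants below are read off DataEncoder.__init__ in encoder.py above and MultiBoxLayer.num_anchors in multibox_layer.py below, and the totals are derived from those values, not stated anywhere in the original code): the docstrings in this folder still quote the stock SSD300 anchor count of 8732, but the three fusion feature maps (75x75, 38x38, 19x19) push the real total much higher. A minimal sanity-check sketch:

feature_map_sizes = (10, 5, 3, 1, 75, 38, 19)  # DataEncoder.__init__ (encoder.py)
anchors_per_cell = (6, 6, 4, 4, 4, 4, 4)       # MultiBoxLayer.num_anchors (multibox_layer.py)
total = sum(f * f * a for f, a in zip(feature_map_sizes, anchors_per_cell))
print(total)  # 30510 -> loc_preds is [N, 30510, 4], conf_preds is [N, 30510, 13]

So wherever a comment or docstring in the MDSSD_300 scripts says 8732, the tensors produced by this model actually have 30510 rows.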
/MDSSD_300/scripts/multibox_layer.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLayer(nn.Module): 12 | num_classes = 13 13 | num_anchors = [6,6,4,4,4,4,4] 14 | 15 | def __init__(self): 16 | super(MultiBoxLayer, self).__init__() 17 | self.in_planes = [512,256,256,256,256,512,1024] 18 | self.loc_layers = nn.ModuleList() 19 | self.conf_layers = nn.ModuleList() 20 | for i in range(len(self.in_planes)): 21 | self.loc_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*4, kernel_size=3, padding=1)) 22 | self.conf_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*13, kernel_size=3, padding=1)) 23 | 24 | def forward(self, xs): 25 | ''' 26 | Args: 27 | xs: (list) of tensor containing intermediate layer outputs. 28 | 29 | Returns: 30 | loc_preds: (tensor) predicted locations, sized [N,8732,4]. 31 | conf_preds: (tensor) predicted class confidences, sized [N,8732,21]. 32 | ''' 33 | y_locs = [] 34 | y_confs = [] 35 | for i, x in enumerate(xs): 36 | y_loc = self.loc_layers[i](x) 37 | N = y_loc.size(0) 38 | y_loc = y_loc.permute(0,2,3,1).contiguous() 39 | y_loc = y_loc.view(N,-1,4) 40 | y_locs.append(y_loc) 41 | 42 | y_conf = self.conf_layers[i](x) 43 | y_conf = y_conf.permute(0,2,3,1).contiguous() 44 | y_conf = y_conf.view(N,-1,13) 45 | y_confs.append(y_conf) 46 | 47 | loc_preds = torch.cat(y_locs, 1) 48 | conf_preds = torch.cat(y_confs, 1) 49 | return loc_preds, conf_preds 50 | -------------------------------------------------------------------------------- /MDSSD_300/scripts/multibox_loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLoss(nn.Module): 12 | num_classes = 13 13 | 14 | def __init__(self): 15 | super(MultiBoxLoss, self).__init__() 16 | 17 | def cross_entropy_loss(self, x, y): 18 | '''Cross entropy loss w/o averaging across all samples. 19 | 20 | Args: 21 | x: (tensor) sized [N,D]. 22 | y: (tensor) sized [N,]. 23 | 24 | Return: 25 | (tensor) cross entroy loss, sized [N,]. 26 | ''' 27 | xmax = x.data.max() 28 | print('x y size {} {}'.format(x.size(), y.size())) 29 | log_sum_exp = torch.log(torch.sum(torch.exp(x-xmax), 1)) + xmax 30 | print('log_sum_exp {}'.format(log_sum_exp.size())) 31 | return log_sum_exp - x.gather(1, y.view(-1,1)) 32 | 33 | def hard_negative_mining(self, conf_loss, pos): 34 | '''Return negative indices that is 3x the number as postive indices. 35 | 36 | Args: 37 | conf_loss: (tensor) cross entroy loss between conf_preds and conf_targets, sized [N*8732,]. 38 | pos: (tensor) positive(matched) box indices, sized [N,8732]. 39 | 40 | Return: 41 | (tensor) negative indices, sized [N,8732]. 
42 | ''' 43 | batch_size, num_boxes = pos.size() 44 | conf_loss[pos.view(-1)] = 0 # set pos boxes = 0, the rest are neg conf_loss 45 | conf_loss = conf_loss.view(batch_size, -1) # [N,8732] 46 | 47 | _,idx = conf_loss.sort(1, descending=True) # sort by neg conf_loss 48 | _,rank = idx.sort(1) # [N,8732] 49 | 50 | num_pos = pos.long().sum(1) # [N,1] 51 | num_neg = torch.clamp(3*num_pos, max=num_boxes-1) # [N,1] 52 | 53 | neg = rank < num_neg.unsqueeze(1).expand_as(rank) 54 | 55 | return neg 56 | 57 | def forward(self, loc_preds, loc_targets, conf_preds, conf_targets): 58 | '''Compute loss between (loc_preds, loc_targets) and (conf_preds, conf_targets). 59 | 60 | Args: 61 | loc_preds: (tensor) predicted locations, sized [batch_size, 8732, 4]. 62 | loc_targets: (tensor) encoded target locations, sized [batch_size, 8732, 4]. 63 | conf_preds: (tensor) predicted class confidences, sized [batch_size, 8732, num_classes]. 64 | conf_targets: (tensor) encoded target classes, sized [batch_size, 8732]. 65 | 66 | loss: 67 | (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + CrossEntropyLoss(conf_preds, conf_targets). 68 | ''' 69 | 70 | # loc_preds = loc_preds[:,:8732,:] 71 | # conf_preds = conf_preds[:,:8732,:] 72 | 73 | batch_size, num_boxes, _ = loc_preds.size() 74 | pos = conf_targets > 0 # [N,8732], pos means the box matched. 75 | # print(pos.size()) 76 | num_matched_boxes = pos.data.float().sum() 77 | if num_matched_boxes == 0: 78 | return torch.tensor([0.], requires_grad=True) 79 | 80 | ################################################################ 81 | # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets) 82 | ################################################################ 83 | pos_mask = pos.unsqueeze(2).expand_as(loc_preds) # [N,8732,4] 84 | pos_loc_preds = loc_preds[pos_mask].view(-1,4) # [#pos,4] 85 | pos_loc_targets = loc_targets[pos_mask].view(-1,4) # [#pos,4] 86 | 87 | loc_loss = F.smooth_l1_loss(pos_loc_preds, pos_loc_targets, size_average=False) 88 | 89 | ################################################################ 90 | # conf_loss = CrossEntropyLoss(pos_conf_preds, pos_conf_targets) 91 | # + CrossEntropyLoss(neg_conf_preds, neg_conf_targets) 92 | ################################################################ 93 | conf_preds = conf_preds.contiguous() 94 | # print(conf_preds.size(), conf_targets.size()) 95 | conf_loss = F.cross_entropy(conf_preds.view(-1,self.num_classes), \ 96 | conf_targets.view(-1), reduce=False) # [N*8732,] 97 | neg = self.hard_negative_mining(conf_loss, pos) # [N,8732] 98 | 99 | pos_mask = pos.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 100 | neg_mask = neg.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 101 | mask = (pos_mask+neg_mask).gt(0) 102 | 103 | pos_and_neg = (pos+neg).gt(0) 104 | preds = conf_preds[mask].view(-1,self.num_classes) # [#pos+#neg,21] 105 | targets = conf_targets[pos_and_neg] # [#pos+#neg,] 106 | conf_loss = F.cross_entropy(preds, targets, size_average=False) 107 | 108 | loc_loss /= num_matched_boxes 109 | conf_loss /= num_matched_boxes 110 | 111 | return loc_loss + conf_loss 112 | -------------------------------------------------------------------------------- /MDSSD_300/scripts/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | class L2Norm(nn.Module): 7 | '''L2Norm layer across all channels and scale.''' 8 | def __init__(self, in_features,scale): 9 | super(L2Norm, 
self).__init__() 10 | self.weight = nn.Parameter(torch.Tensor(in_features)) 11 | self.reset_parameters(scale) 12 | 13 | def reset_parameters(self, scale): 14 | nn.init.constant_(self.weight, scale) 15 | 16 | def forward(self, x): 17 | x = F.normalize(x, dim=1) 18 | scale = self.weight[None,:,None,None] 19 | return scale * x -------------------------------------------------------------------------------- /MDSSD_300/scripts/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.nn.functional as F 4 | import torchvision.transforms as transforms 5 | 6 | import sys 7 | from mdssd import MDSSD300 8 | from encoder import DataEncoder 9 | import cv2 10 | 11 | VOC_LABELS = ( 12 | 'ignored regions', 13 | 'pedestrian', 14 | 'people', 15 | 'bicycle', 16 | 'car', 17 | 'van', 18 | 'truck', 19 | 'tricycle', 20 | 'awning-tricycle', 21 | 'bus', 22 | 'motor', 23 | 'other' 24 | ) 25 | 26 | 27 | # Load model 28 | net = MDSSD300() 29 | checkpoint = torch.load('./checkpoint/ckpt.pth') 30 | 31 | keys = [] 32 | for k,v in checkpoint['net'].items(): 33 | if "module" in k: 34 | keys.append(k) 35 | for i in keys: 36 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 37 | del checkpoint['net'][i] 38 | 39 | net.load_state_dict(checkpoint['net']) 40 | net.eval() 41 | 42 | if len(sys.argv) == 2: 43 | img_path = sys.argv[1] 44 | else: 45 | img_path = './images/img1.jpg' 46 | 47 | # Load test image 48 | img = cv2.imread(img_path) 49 | img1 = cv2.resize(img, (300, 300)) 50 | transform = transforms.Compose([transforms.ToTensor(), 51 | transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]) 52 | img1 = transform(img1) 53 | 54 | # Forward 55 | with torch.no_grad(): 56 | x = torch.tensor(img1) 57 | loc_preds, conf = net(x.unsqueeze(0)) 58 | # Decode 59 | data_encoder = DataEncoder() 60 | boxes, labels, scores = data_encoder.decode(loc_preds.data.squeeze(0), F.softmax(conf.squeeze(0), dim=1).data) 61 | for box, label, score in zip(boxes, labels, scores): 62 | box[::2] *= img.shape[1] 63 | box[1::2] *= img.shape[0] 64 | for b, s in zip(box, score): 65 | if s > 0.25: 66 | print('label:',VOC_LABELS[int(label[0])], 'score:', score) 67 | b = list(b) 68 | cv2.rectangle(img, (b[0], b[1]), (b[2], b[3]), (255, 255, 255), 2) 69 | title = '{}: {}'.format(VOC_LABELS[int(label[0])], round(float(score[0]), 2)) 70 | cv2.putText(img, title, (b[0], b[1]), cv2.FONT_ITALIC, 0.6, (0, 255, 0), 2) 71 | cv2.imshow('img', img) 72 | cv2.waitKey(0) -------------------------------------------------------------------------------- /MDSSD_300/scripts/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import argparse 5 | import itertools 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | import torch.backends.cudnn as cudnn 12 | 13 | import torchvision 14 | import torchvision.transforms as transforms 15 | 16 | import numpy as np 17 | from mdssd import MDSSD300 18 | from datagen import ListDataset 19 | from multibox_loss import MultiBoxLoss 20 | 21 | lr = 0.001 22 | resume = False # Resume from checkpoint 23 | epoch = 200 24 | batch_size = 8 25 | 26 | use_cuda = torch.cuda.is_available() 27 | best_loss = float('inf') # best test loss 28 | start_epoch = 0 # start from epoch 0 or last epoch 29 | 30 | # Data 31 | print('==> Preparing data..') 32 | transform = 
transforms.Compose([transforms.ToTensor(), 33 | transforms.Normalize(mean=(0.356, 0.368, 0.362), std=(0.242, 0.235, 0.236))]) 34 | 35 | trainset = ListDataset(root='../../../../VisDrone2019/dataset/VisDrone2018-DET-train/images/', list_file='../../../../VisDrone2019/dataset/VisDrone2018-DET-train/annotations/', train=True, transform=transform) 36 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True, num_workers=4) 37 | 38 | valset = ListDataset(root='../../../../VisDrone2019/dataset/VisDrone2019-DET-val/images/', list_file='../../../../VisDrone2019/dataset/VisDrone2019-DET-val/annotations/', train=True, transform=transform) 39 | valloader = torch.utils.data.DataLoader(valset, batch_size=8, shuffle=True, num_workers=4) 40 | 41 | # Model 42 | net = MDSSD300() 43 | if resume: 44 | print('==> Resuming from checkpoint..') 45 | checkpoint = torch.load('./checkpoint/ckpt.pth') 46 | 47 | keys = [] 48 | for k,v in checkpoint['net'].items(): 49 | if "module" in k: 50 | keys.append(k) 51 | for i in keys: 52 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 53 | del checkpoint['net'][i] 54 | 55 | net.load_state_dict(checkpoint['net']) 56 | best_loss = checkpoint['loss'] 57 | start_epoch = checkpoint['epoch'] 58 | else: 59 | # Convert from pretrained VGG model. 60 | try: 61 | net.load_state_dict(torch.load('../model/ssd.pth')) 62 | print('==> Pretrain model read successfully') 63 | except: 64 | print('==> Pretrain model read failed or not existed, training from init') 65 | 66 | criterion = MultiBoxLoss() 67 | 68 | if use_cuda: 69 | net = torch.nn.DataParallel(net, device_ids=[0]) 70 | net.cuda() 71 | cudnn.benchmark = True 72 | 73 | optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4) 74 | # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.25) 75 | 76 | # Training 77 | def train(epoch,prev_val_loss, last_saved): 78 | print('\nEpoch: %d' % epoch) 79 | net.train() 80 | train_loss = 0 81 | for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader): 82 | if use_cuda: 83 | images = images.cuda() 84 | loc_targets = loc_targets.cuda() 85 | conf_targets = conf_targets.cuda() 86 | 87 | images = torch.tensor(images) 88 | loc_targets = torch.tensor(loc_targets) 89 | conf_targets = torch.tensor(conf_targets) 90 | 91 | optimizer.zero_grad() 92 | loc_preds, conf_preds = net(images) 93 | loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 94 | loss.backward() 95 | optimizer.step() 96 | # scheduler.step() 97 | 98 | train_loss += loss.item() 99 | if batch_idx%100 == 0: 100 | val_loss_tot = 0 101 | for batch_idx_val, (images, loc_targets, conf_targets) in enumerate(valloader): 102 | if use_cuda: 103 | images = images.cuda() 104 | loc_targets = loc_targets.cuda() 105 | conf_targets = conf_targets.cuda() 106 | 107 | images = torch.tensor(images) 108 | loc_targets = torch.tensor(loc_targets) 109 | conf_targets = torch.tensor(conf_targets) 110 | 111 | loc_preds, conf_preds = net(images) 112 | val_loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 113 | val_loss_tot += val_loss.item() 114 | 115 | val_loss_tot /= (batch_idx_val+1) 116 | if val_loss_tot < prev_val_loss: 117 | os.makedirs('checkpoint', exist_ok=True) 118 | torch.save({ 119 | 'epoch': epoch, 120 | 'net': net.state_dict(), 121 | 'loss': loss, 122 | }, 'checkpoint/ckpt.pth') 123 | print("Saved.") 124 | prev_val_loss = val_loss_tot 125 | last_saved = [epoch, batch_idx] 126 | 127 | print('epoch: {}, batch_idx: {},loss: {}, train_loss: {}, 
best_val_loss: {}, last_saved: {}'.format(epoch, batch_idx, loss.item(), train_loss/(batch_idx+1), prev_val_loss, last_saved)) 128 | 129 | return prev_val_loss, last_saved 130 | 131 | 132 | prev_val_loss = 999 133 | last_saved = [start_epoch,0] 134 | for epoch_num in range(start_epoch, start_epoch+epoch): 135 | prev_val_loss, last_saved = train(epoch_num, prev_val_loss, last_saved) 136 | -------------------------------------------------------------------------------- /MDSSD_512/scripts/datagen.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load image/class/box from a annotation file. 3 | 4 | ''' 5 | from __future__ import print_function 6 | 7 | import os 8 | import sys 9 | import os.path 10 | 11 | import random 12 | import numpy as np 13 | 14 | import torch 15 | import torch.utils.data as data 16 | import torchvision.transforms as transforms 17 | 18 | from encoder import DataEncoder 19 | import cv2 20 | 21 | import pandas as pd 22 | import shutil 23 | import os 24 | import numpy as np 25 | import glob 26 | import xml.etree.ElementTree as ET 27 | from xml.dom import minidom 28 | 29 | class ListDataset(data.Dataset): 30 | img_size = 512 31 | 32 | def __init__(self, root, list_file, train, transform): 33 | ''' 34 | Args: 35 | root: (str) ditectory to images. 36 | list_file: (str) path to annotation files. 37 | train: (boolean) train or test. 38 | transform: ([transforms]) image transforms. 39 | ''' 40 | self.root = root 41 | self.train = train 42 | self.transform = transform 43 | 44 | self.fnames = [] 45 | self.boxes = [] 46 | self.labels = [] 47 | 48 | self.data_encoder = DataEncoder() 49 | self.num_samples = 0 50 | 51 | # VisDrone 52 | 53 | for i in os.listdir(list_file): 54 | self.num_samples += 1 55 | self.fnames.append(i) 56 | box = [] 57 | labels = [] 58 | with open(os.path.join(list_file,i)) as f: 59 | f = f.read().split("\n") 60 | f = f[:-1] 61 | num_objs = len(f) 62 | 63 | for j in range(num_objs): 64 | f[j] = f[j].split(",") 65 | xmin = float(f[j][0]) 66 | ymin = float(f[j][1]) 67 | w = float(f[j][2]) 68 | h = float(f[j][3]) 69 | 70 | box.append([xmin,ymin,xmin+w,ymin+h]) 71 | labels.append(int(f[j][5])) 72 | 73 | self.boxes.append(torch.Tensor(box)) 74 | self.labels.append(torch.LongTensor(labels)) 75 | 76 | 77 | def __getitem__(self, idx): 78 | '''Load a image, and encode its bbox locations and class labels. 79 | Args: 80 | idx: (int) image index. 81 | Returns: 82 | img: (tensor) image tensor. 83 | loc_target: (tensor) location targets, sized [8732,4]. 84 | conf_target: (tensor) label targets, sized [8732,]. 85 | ''' 86 | # Load image and bbox locations. 87 | fname = self.fnames[idx] 88 | img = cv2.imread(os.path.join(self.root, fname[:-4]+".jpg")) 89 | boxes = self.boxes[idx].clone() 90 | labels = self.labels[idx] 91 | 92 | # Data augmentation while training. 93 | if self.train: 94 | img, boxes = self.random_flip(img, boxes) 95 | img, boxes, labels = self.random_crop(img, boxes, labels) 96 | 97 | # Scale bbox locaitons to [0,1]. 98 | w,h = img.shape[1], img.shape[0] 99 | boxes /= torch.Tensor([w,h,w,h]).expand_as(boxes) 100 | img = cv2.resize(img, (self.img_size,self.img_size)) 101 | img = self.transform(img) 102 | 103 | # Encode loc & conf targets. 104 | 105 | loc_target, conf_target = self.data_encoder.encode(boxes, labels) 106 | return img, loc_target, conf_target 107 | 108 | def random_flip(self, img, boxes): 109 | '''Randomly flip the image and adjust the bbox locations. 
110 | For bbox (xmin, ymin, xmax, ymax), the flipped bbox is: 111 | (w-xmax, ymin, w-xmin, ymax). 112 | Args: 113 | img: (ndarray.Image) image. f 114 | boxes: (tensor) bbox locations, sized [#obj, 4]. 115 | Returns: 116 | img: (ndarray.Image) randomly flipped image. 117 | boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4]. 118 | ''' 119 | if random.random() < 0.5: 120 | img = cv2.flip(img, 1) 121 | w = img.shape[1] 122 | xmin = w - boxes[:,2] 123 | xmax = w - boxes[:,0] 124 | boxes[:,0] = xmin 125 | boxes[:,2] = xmax 126 | return img, boxes 127 | 128 | def random_crop(self, img, boxes, labels): 129 | '''Randomly crop the image and adjust the bbox locations. 130 | For more details, see 'Chapter2.2: Data augmentation' of the paper. 131 | Args: 132 | img: (ndarray.Image) image. 133 | boxes: (tensor) bbox locations, sized [#obj, 4]. 134 | labels: (tensor) bbox labels, sized [#obj,]. 135 | Returns: 136 | img: (ndarray.Image) cropped image. 137 | selected_boxes: (tensor) selected bbox locations. 138 | labels: (tensor) selected bbox labels. 139 | ''' 140 | imw, imh = img.shape[1], img.shape[0] 141 | while True: 142 | min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])# random choice the one 143 | if min_iou is None: 144 | return img, boxes, labels 145 | 146 | for _ in range(100): 147 | w = random.randrange(int(0.1*imw), imw) 148 | h = random.randrange(int(0.1*imh), imh) 149 | 150 | if h > 2*w or w > 2*h or h < 1 or w < 1: 151 | continue 152 | 153 | x = random.randrange(imw - w) 154 | y = random.randrange(imh - h) 155 | roi = torch.Tensor([[x, y, x+w, y+h]]) 156 | 157 | center = (boxes[:,:2] + boxes[:,2:]) / 2 # [N,2] 158 | roi2 = roi.expand(len(center), 4) # [N,4] 159 | 160 | mask = (center > roi2[:,:2]) & (center < roi2[:,2:]) # [N,2] 161 | mask = mask[:,0] & mask[:,1] #[N,] 162 | 163 | if not mask.any(): 164 | continue 165 | 166 | selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1)) 167 | 168 | iou = self.data_encoder.iou(selected_boxes, roi) 169 | if iou.min() < min_iou: 170 | continue 171 | img = img[y:y+h, x:x+w, :] 172 | 173 | selected_boxes[:,0].add_(-x).clamp_(min=0, max=w) 174 | selected_boxes[:,1].add_(-y).clamp_(min=0, max=h) 175 | selected_boxes[:,2].add_(-x).clamp_(min=0, max=w) 176 | selected_boxes[:,3].add_(-y).clamp_(min=0, max=h) 177 | 178 | return img, selected_boxes, labels[mask] 179 | 180 | def __len__(self): 181 | return self.num_samples -------------------------------------------------------------------------------- /MDSSD_512/scripts/encoder.py: -------------------------------------------------------------------------------- 1 | '''Encode target locations and labels.''' 2 | import torch 3 | 4 | import math 5 | import itertools 6 | 7 | class DataEncoder: 8 | def __init__(self): 9 | '''Compute default box sizes with scale and aspect transform.''' 10 | scale = 512. 
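        # Sanity-check sketch, assuming the feature_map_sizes defined just below
        # and the per-location anchor counts from multibox_layer.py
        # ([6,6,4,4,4,4,4,4]): the "[8732, ...]" sizes quoted in the docstrings
        # are the vanilla SSD300 prior count, while this 512 configuration gives
        #
        #   fm_sizes         = (16, 8, 4, 2, 1, 128, 64, 32)
        #   anchors_per_cell = (6, 6, 4, 4, 4, 4, 4, 4)
        #   sum(a * f * f for a, f in zip(anchors_per_cell, fm_sizes))  # -> 88020
        #
        # so encode() should return loc targets sized [88020, 4] and conf targets
        # sized [88020].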
11 | steps = [s / scale for s in (32, 64, 128, 256, 512, 4, 8, 16)] 12 | sizes_ssd = [s / scale for s in (115.0, 230.4, 307.2, 384.0, 460.8, 537.6)] 13 | sizes_fusion = [s / scale for s in (35.84, 76.8, 153.6, 230.4)] 14 | aspect_ratios = ((2,3), (2,3), (2,), (2,), (2,), (2,), (2,),(2,)) 15 | feature_map_sizes = (16,8,4,2,1,128,64,32) 16 | num_layers = len(feature_map_sizes) 17 | 18 | boxes = [] 19 | for i in range(num_layers): 20 | fmsize = feature_map_sizes[i] # feature map size 21 | for h,w in itertools.product(range(fmsize), repeat=2): 22 | cx = (w + 0.5)*steps[i] 23 | cy = (h + 0.5)*steps[i] 24 | 25 | if i<5: 26 | s = sizes_ssd[i] 27 | boxes.append((cx, cy, s, s)) 28 | 29 | s = math.sqrt(sizes_ssd[i] * sizes_ssd[i+1]) 30 | boxes.append((cx, cy, s, s)) 31 | 32 | s = sizes_ssd[i] 33 | for ar in aspect_ratios[i]: 34 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 35 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 36 | 37 | else: 38 | s = sizes_fusion[i-5] 39 | boxes.append((cx, cy, s, s)) 40 | 41 | s = math.sqrt(sizes_fusion[i-5] * sizes_fusion[i-5+1]) 42 | boxes.append((cx, cy, s, s)) 43 | 44 | s = sizes_fusion[i-5] 45 | for ar in aspect_ratios[i]: 46 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 47 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 48 | 49 | self.default_boxes = torch.Tensor(boxes) 50 | 51 | def iou(self, box1, box2): 52 | '''Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2]. 53 | 54 | Args: 55 | box1: (tensor) bounding boxes, sized [N,4]. 56 | box2: (tensor) bounding boxes, sized [M,4]. 57 | 58 | Return: 59 | (tensor) iou, sized [N,M]. 60 | ''' 61 | N = box1.size(0) 62 | M = box2.size(0) 63 | 64 | lt = torch.max( 65 | box1[:,:2].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 66 | box2[:,:2].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 67 | ) 68 | 69 | rb = torch.min( 70 | box1[:,2:].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 71 | box2[:,2:].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 72 | ) 73 | 74 | wh = rb - lt # [N,M,2] 75 | wh[wh<0] = 0 # clip at 0 76 | inter = wh[:,:,0] * wh[:,:,1] # [N,M] 77 | 78 | area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1]) # [N,] 79 | area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1]) # [M,] 80 | area1 = area1.unsqueeze(1).expand_as(inter) # [N,] -> [N,1] -> [N,M] 81 | area2 = area2.unsqueeze(0).expand_as(inter) # [M,] -> [1,M] -> [N,M] 82 | 83 | iou = inter / (area1 + area2 - inter) 84 | return iou 85 | 86 | def encode(self, boxes, classes, threshold=0.5): 87 | '''Transform target bounding boxes and class labels to SSD boxes and classes. 88 | 89 | Match each object box to all the default boxes, pick the ones with the 90 | Jaccard-Index > 0.5: 91 | Jaccard(A,B) = AB / (A+B-AB) 92 | 93 | Args: 94 | boxes: (tensor) object bounding boxes (xmin,ymin,xmax,ymax) of a image, sized [#obj, 4]. 95 | classes: (tensor) object class labels of a image, sized [#obj,]. 96 | threshold: (float) Jaccard index threshold 97 | 98 | Returns: 99 | boxes: (tensor) bounding boxes, sized [#obj, 8732, 4]. 
100 | classes: (tensor) class labels, sized [8732,] 101 | ''' 102 | default_boxes = self.default_boxes 103 | num_default_boxes = default_boxes.size(0) 104 | num_objs = boxes.size(0) 105 | 106 | iou = self.iou( # [#obj,8732] 107 | boxes, 108 | torch.cat([default_boxes[:,:2] - default_boxes[:,2:]/2, 109 | default_boxes[:,:2] + default_boxes[:,2:]/2], 1) 110 | ) 111 | 112 | iou, max_idx = iou.max(0) # [1,8732] 113 | max_idx.squeeze_(0) # [8732,] 114 | iou.squeeze_(0) # [8732,] 115 | 116 | boxes = boxes[max_idx] # [8732,4] 117 | variances = [0.1, 0.2] 118 | cxcy = (boxes[:,:2] + boxes[:,2:])/2 - default_boxes[:,:2] # [8732,2] 119 | cxcy /= variances[0] * default_boxes[:,2:] 120 | wh = (boxes[:,2:] - boxes[:,:2]) / default_boxes[:,2:] # [8732,2] 121 | wh = torch.log(wh) / variances[1] 122 | loc = torch.cat([cxcy, wh], 1) # [8732,4] 123 | 124 | conf = 1 + classes[max_idx] # [8732,], background class = 0 125 | conf[iou 0: 153 | try: 154 | i = order[0] 155 | except: 156 | i = order 157 | keep.append(i) 158 | 159 | if order.size == 1: 160 | break 161 | 162 | xx1 = x1[order[1:]].clamp(min=x1[i]) 163 | yy1 = y1[order[1:]].clamp(min=y1[i]) 164 | xx2 = x2[order[1:]].clamp(max=x2[i]) 165 | yy2 = y2[order[1:]].clamp(max=y2[i]) 166 | 167 | w = (xx2-xx1).clamp(min=0) 168 | h = (yy2-yy1).clamp(min=0) 169 | inter = w*h 170 | 171 | if mode == 'union': 172 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 173 | elif mode == 'min': 174 | ovr = inter / areas[order[1:]].clamp(max=areas[i]) 175 | else: 176 | raise TypeError('Unknown nms mode: %s.' % mode) 177 | 178 | ids = (ovr<=threshold).nonzero().squeeze() 179 | if ids.size == 0: 180 | break 181 | order = order[ids+1] 182 | return torch.LongTensor(keep) 183 | 184 | def decode(self, loc, conf): 185 | '''Transform predicted loc/conf back to real bbox locations and class labels. 186 | 187 | Args: 188 | loc: (tensor) predicted loc, sized [8732,4]. 189 | conf: (tensor) predicted conf, sized [8732,21]. 190 | 191 | Returns: 192 | boxes: (tensor) bbox locations, sized [#obj, 4]. 193 | labels: (tensor) class labels, sized [#obj,1]. 
194 | ''' 195 | variances = (0.1, 0.2) 196 | wh = torch.exp(loc[:,2:]*variances[1]) * self.default_boxes[:,2:] 197 | cxcy = loc[:,:2] * variances[0] * self.default_boxes[:,2:] + self.default_boxes[:,:2] 198 | box_preds = torch.cat([cxcy-wh/2, cxcy+wh/2], 1) # [8732,4] 199 | 200 | boxes = [] 201 | labels = [] 202 | scores = [] 203 | num_classes = conf.size(1) 204 | for i in range(num_classes-1): 205 | score = conf[:,i+1] # class i corresponds to (i+1) column 206 | mask = score > 0.1 207 | 208 | if not mask.any(): 209 | continue 210 | 211 | box = box_preds[mask.nonzero().squeeze()] 212 | score = score[mask] 213 | 214 | if len(score) == 1: 215 | continue 216 | keep = self.nms(box, score, threshold=0.3) 217 | boxes.append(box[keep]) 218 | labels.append(torch.LongTensor(len(box[keep])).fill_(i)) 219 | scores.append(score[keep]) 220 | 221 | return boxes, labels, scores 222 | -------------------------------------------------------------------------------- /MDSSD_512/scripts/fusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | from torch.autograd import Variable 7 | 8 | from norm import L2Norm 9 | 10 | class FusionBlock(nn.Module): 11 | def __init__(self, big_features, small_features): 12 | super(FusionBlock, self).__init__() 13 | 14 | # Bigger feature map 15 | self.conv1_1 = nn.Conv2d(big_features, 256, kernel_size=3, padding=1, dilation=1) 16 | self.Norm1 = L2Norm(256, 20) 17 | 18 | # Smaller feature map 19 | self.deconv2_1 = nn.ConvTranspose2d(small_features, 256, 2, stride=2, dilation=1) 20 | self.conv2_1 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 21 | self.deconv2_2 = nn.ConvTranspose2d(256, 256, 2, stride=2, dilation=1) 22 | self.conv2_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 23 | self.deconv2_3 = nn.ConvTranspose2d(256, 256, 3, stride=2, dilation=1) 24 | self.conv2_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 25 | self.Norm2 = L2Norm(256, 20) 26 | 27 | # Common 28 | self.conv3_1 = nn.Conv2d(256, big_features, kernel_size=3, padding=1, dilation=1) 29 | 30 | 31 | def forward(self, big, small): 32 | h1 = self.conv1_1(big) 33 | h1 = self.Norm1(h1) 34 | 35 | h2 = self.deconv2_1(small) 36 | # print(h2.size()) 37 | h2 = F.relu(self.conv2_1(h2)) 38 | # print(h2.size()) 39 | h2 = self.deconv2_2(h2) 40 | # print(h2.size()) 41 | h2 = F.relu(self.conv2_2(h2)) 42 | # print(h2.size()) 43 | h2 = self.deconv2_3(h2) 44 | # print(h2.size()) 45 | h2 = self.conv2_3(h2) 46 | # print(h2.size()) 47 | h2 = self.Norm2(h2) 48 | 49 | size = h2.size()[3] 50 | diff_odd = h2.size()[-1] - h1.size()[-1] 51 | h2 = h2[:,:,(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2)),(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2))] 52 | 53 | # print(h1.size(), h2.size()) 54 | h = F.relu(h1+h2) 55 | h = F.relu(self.conv3_1(h)) 56 | 57 | return h 58 | 59 | if __name__ == '__main__': 60 | big = torch.randn(1, 256, 128, 128) 61 | small = torch.rand(1,512,16,16) 62 | net = FusionBlock(256,512) 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /MDSSD_512/scripts/mdssd.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | 9 | from torch.autograd import Variable 10 | 11 | from multibox_layer 
import MultiBoxLayer 12 | from fusion import FusionBlock 13 | from norm import L2Norm 14 | 15 | class MDSSD300(nn.Module): 16 | input_size = 512 17 | 18 | def __init__(self): 19 | super(MDSSD300, self).__init__() 20 | 21 | # model 22 | self.base = self.VGG16() 23 | self.norm4 = L2Norm(512, 20) # 64 24 | 25 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 26 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 27 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 28 | 29 | self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 30 | 31 | self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 32 | 33 | self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1) 34 | self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2) #16 35 | 36 | self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1) 37 | self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2) #8 38 | 39 | self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1) 40 | self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1,stride=2) #4 41 | 42 | self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1) 43 | self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2) #2 44 | 45 | self.conv12_1 = nn.Conv2d(256, 128, kernel_size=1) 46 | self.conv12_2 = nn.Conv2d(128, 256, kernel_size=2) 47 | 48 | self.Fusion1 = FusionBlock(256,512) 49 | self.Fusion2 = FusionBlock(512,256) 50 | self.Fusion3 = FusionBlock(1024,256) 51 | 52 | # multibox layer 53 | self.multibox = MultiBoxLayer() 54 | 55 | def forward(self, x): 56 | odd_count = 0 57 | odd = [] 58 | hs = [] 59 | vgg = [] 60 | fusion_layers = [] 61 | h = self.base[0](x) 62 | vgg.append(h) 63 | for i in range(1,len(self.base)): 64 | h = self.base[i](h) 65 | # print(h.size()) 66 | vgg.append(h) 67 | fusion_layers.append(vgg[15]) 68 | odd.append(2) 69 | odd_count = 3 70 | fusion_layers.append(h) 71 | h = F.max_pool2d(h, kernel_size=2, stride=2, ceil_mode=True) 72 | # print(h.size()) 73 | 74 | h = F.relu(self.conv5_1(h)) 75 | h = F.relu(self.conv5_2(h)) 76 | h = F.relu(self.conv5_3(h)) 77 | h = F.max_pool2d(h, kernel_size=3, padding=1, stride=1, ceil_mode=True) 78 | # print(h.size()) 79 | 80 | h = F.relu(self.conv6(h)) 81 | h = F.relu(self.conv7(h)) 82 | # print(h.size()) 83 | fusion_layers.append(h) 84 | 85 | h = F.relu(self.conv8_1(h)) 86 | h = F.relu(self.conv8_2(h)) 87 | # print(h.size()) 88 | hs.append(h) # conv8_2 89 | 90 | h = F.relu(self.conv9_1(h)) 91 | h = F.relu(self.conv9_2(h)) 92 | # print(h.size()) 93 | hs.append(h) # conv9_2 94 | 95 | h = F.relu(self.conv10_1(h)) 96 | h = F.relu(self.conv10_2(h)) 97 | # print(h.size()) 98 | hs.append(h) # conv10_2 99 | 100 | h = F.relu(self.conv11_1(h)) 101 | h = F.relu(self.conv11_2(h)) 102 | # print(h.size()) 103 | hs.append(h) # conv11_2 104 | 105 | h = F.relu(self.conv12_1(h)) 106 | h = F.relu(self.conv12_2(h)) 107 | # print(h.size()) 108 | hs.append(h) # conv12_2 109 | 110 | # Fusion Blocks 111 | f = self.Fusion1(fusion_layers[0],hs[-5]) 112 | hs.append(f) 113 | f = self.Fusion2(fusion_layers[1],hs[-5]) 114 | hs.append(f) 115 | f = self.Fusion3(fusion_layers[2],hs[-5]) 116 | hs.append(f) 117 | 118 | loc_preds, conf_preds = self.multibox(hs) 119 | 120 | return loc_preds, conf_preds 121 | 122 | def VGG16(self): 123 | '''VGG16 layers.''' 124 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512] 125 | layers = [] 126 | in_channels = 3 127 | for x in cfg: 128 | if x == 'M': 129 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, 
ceil_mode=True)] 130 | else: 131 | layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 132 | nn.ReLU(True)] 133 | in_channels = x 134 | return nn.Sequential(*layers) 135 | 136 | if __name__ == '__main__': 137 | t = torch.randn(1, 3, 300, 300) 138 | net = MDSSD300() 139 | # print(net) 140 | res = net.forward(t) 141 | -------------------------------------------------------------------------------- /MDSSD_512/scripts/multibox_layer.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLayer(nn.Module): 12 | num_classes = 13 13 | num_anchors = [6,6,4,4,4,4,4,4] 14 | 15 | def __init__(self): 16 | super(MultiBoxLayer, self).__init__() 17 | self.in_planes = [512,256,256,256,256,256,512,1024] 18 | self.loc_layers = nn.ModuleList() 19 | self.conf_layers = nn.ModuleList() 20 | for i in range(len(self.in_planes)): 21 | self.loc_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*4, kernel_size=3, padding=1)) 22 | self.conf_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*13, kernel_size=3, padding=1)) 23 | 24 | def forward(self, xs): 25 | ''' 26 | Args: 27 | xs: (list) of tensor containing intermediate layer outputs. 28 | 29 | Returns: 30 | loc_preds: (tensor) predicted locations, sized [N,8732,4]. 31 | conf_preds: (tensor) predicted class confidences, sized [N,8732,21]. 32 | ''' 33 | y_locs = [] 34 | y_confs = [] 35 | for i, x in enumerate(xs): 36 | y_loc = self.loc_layers[i](x) 37 | N = y_loc.size(0) 38 | y_loc = y_loc.permute(0,2,3,1).contiguous() 39 | y_loc = y_loc.view(N,-1,4) 40 | y_locs.append(y_loc) 41 | 42 | y_conf = self.conf_layers[i](x) 43 | y_conf = y_conf.permute(0,2,3,1).contiguous() 44 | y_conf = y_conf.view(N,-1,13) 45 | y_confs.append(y_conf) 46 | 47 | loc_preds = torch.cat(y_locs, 1) 48 | conf_preds = torch.cat(y_confs, 1) 49 | return loc_preds, conf_preds 50 | -------------------------------------------------------------------------------- /MDSSD_512/scripts/multibox_loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLoss(nn.Module): 12 | num_classes = 13 13 | 14 | def __init__(self): 15 | super(MultiBoxLoss, self).__init__() 16 | 17 | def cross_entropy_loss(self, x, y): 18 | '''Cross entropy loss w/o averaging across all samples. 19 | 20 | Args: 21 | x: (tensor) sized [N,D]. 22 | y: (tensor) sized [N,]. 23 | 24 | Return: 25 | (tensor) cross entroy loss, sized [N,]. 26 | ''' 27 | xmax = x.data.max() 28 | print('x y size {} {}'.format(x.size(), y.size())) 29 | log_sum_exp = torch.log(torch.sum(torch.exp(x-xmax), 1)) + xmax 30 | print('log_sum_exp {}'.format(log_sum_exp.size())) 31 | return log_sum_exp - x.gather(1, y.view(-1,1)) 32 | 33 | def hard_negative_mining(self, conf_loss, pos): 34 | '''Return negative indices that is 3x the number as postive indices. 35 | 36 | Args: 37 | conf_loss: (tensor) cross entroy loss between conf_preds and conf_targets, sized [N*8732,]. 38 | pos: (tensor) positive(matched) box indices, sized [N,8732]. 39 | 40 | Return: 41 | (tensor) negative indices, sized [N,8732]. 
42 | ''' 43 | batch_size, num_boxes = pos.size() 44 | conf_loss[pos.view(-1)] = 0 # set pos boxes = 0, the rest are neg conf_loss 45 | conf_loss = conf_loss.view(batch_size, -1) # [N,8732] 46 | 47 | _,idx = conf_loss.sort(1, descending=True) # sort by neg conf_loss 48 | _,rank = idx.sort(1) # [N,8732] 49 | 50 | num_pos = pos.long().sum(1) # [N,1] 51 | num_neg = torch.clamp(3*num_pos, max=num_boxes-1) # [N,1] 52 | 53 | neg = rank < num_neg.unsqueeze(1).expand_as(rank) 54 | 55 | return neg 56 | 57 | def forward(self, loc_preds, loc_targets, conf_preds, conf_targets): 58 | '''Compute loss between (loc_preds, loc_targets) and (conf_preds, conf_targets). 59 | 60 | Args: 61 | loc_preds: (tensor) predicted locations, sized [batch_size, 8732, 4]. 62 | loc_targets: (tensor) encoded target locations, sized [batch_size, 8732, 4]. 63 | conf_preds: (tensor) predicted class confidences, sized [batch_size, 8732, num_classes]. 64 | conf_targets: (tensor) encoded target classes, sized [batch_size, 8732]. 65 | 66 | loss: 67 | (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + CrossEntropyLoss(conf_preds, conf_targets). 68 | ''' 69 | 70 | # loc_preds = loc_preds[:,:8732,:] 71 | # conf_preds = conf_preds[:,:8732,:] 72 | 73 | batch_size, num_boxes, _ = loc_preds.size() 74 | pos = conf_targets > 0 # [N,8732], pos means the box matched. 75 | # print(pos.size()) 76 | num_matched_boxes = pos.data.float().sum() 77 | if num_matched_boxes == 0: 78 | return torch.tensor([0.], requires_grad=True) 79 | 80 | ################################################################ 81 | # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets) 82 | ################################################################ 83 | pos_mask = pos.unsqueeze(2).expand_as(loc_preds) # [N,8732,4] 84 | pos_loc_preds = loc_preds[pos_mask].view(-1,4) # [#pos,4] 85 | pos_loc_targets = loc_targets[pos_mask].view(-1,4) # [#pos,4] 86 | 87 | loc_loss = F.smooth_l1_loss(pos_loc_preds, pos_loc_targets, size_average=False) 88 | 89 | ################################################################ 90 | # conf_loss = CrossEntropyLoss(pos_conf_preds, pos_conf_targets) 91 | # + CrossEntropyLoss(neg_conf_preds, neg_conf_targets) 92 | ################################################################ 93 | conf_preds = conf_preds.contiguous() 94 | # print(conf_preds.size(), conf_targets.size()) 95 | conf_loss = F.cross_entropy(conf_preds.view(-1,self.num_classes), \ 96 | conf_targets.view(-1), reduce=False) # [N*8732,] 97 | neg = self.hard_negative_mining(conf_loss, pos) # [N,8732] 98 | 99 | pos_mask = pos.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 100 | neg_mask = neg.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 101 | mask = (pos_mask+neg_mask).gt(0) 102 | 103 | pos_and_neg = (pos+neg).gt(0) 104 | preds = conf_preds[mask].view(-1,self.num_classes) # [#pos+#neg,21] 105 | targets = conf_targets[pos_and_neg] # [#pos+#neg,] 106 | conf_loss = F.cross_entropy(preds, targets, size_average=False) 107 | 108 | loc_loss /= num_matched_boxes 109 | conf_loss /= num_matched_boxes 110 | 111 | return loc_loss + conf_loss 112 | -------------------------------------------------------------------------------- /MDSSD_512/scripts/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | class L2Norm(nn.Module): 7 | '''L2Norm layer across all channels and scale.''' 8 | def __init__(self, in_features,scale): 9 | super(L2Norm, 
self).__init__() 10 | self.weight = nn.Parameter(torch.Tensor(in_features)) 11 | self.reset_parameters(scale) 12 | 13 | def reset_parameters(self, scale): 14 | nn.init.constant_(self.weight, scale) 15 | 16 | def forward(self, x): 17 | x = F.normalize(x, dim=1) 18 | scale = self.weight[None,:,None,None] 19 | return scale * x -------------------------------------------------------------------------------- /MDSSD_512/scripts/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.nn.functional as F 4 | import torchvision.transforms as transforms 5 | 6 | import sys 7 | from mdssd import MDSSD300 8 | from encoder import DataEncoder 9 | import cv2 10 | 11 | VOC_LABELS = ( 12 | 'ignored regions', 13 | 'pedestrian', 14 | 'people', 15 | 'bicycle', 16 | 'car', 17 | 'van', 18 | 'truck', 19 | 'tricycle', 20 | 'awning-tricycle', 21 | 'bus', 22 | 'motor', 23 | 'other' 24 | ) 25 | 26 | 27 | # Load model 28 | net = MDSSD300() 29 | checkpoint = torch.load('./checkpoint/ckpt.pth') 30 | 31 | keys = [] 32 | for k,v in checkpoint['net'].items(): 33 | if "module" in k: 34 | keys.append(k) 35 | for i in keys: 36 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 37 | del checkpoint['net'][i] 38 | 39 | net.load_state_dict(checkpoint['net']) 40 | net.eval() 41 | 42 | if len(sys.argv) == 2: 43 | img_path = sys.argv[1] 44 | else: 45 | img_path = './images/img5.jpg' 46 | 47 | # Load test image 48 | img = cv2.imread(img_path) 49 | img1 = cv2.resize(img, (512, 312)) 50 | transform = transforms.Compose([transforms.ToTensor(), 51 | transforms.Normalize(mean=(0.356, 0.368, 0.362), std=(0.242, 0.235, 0.236))]) 52 | img1 = transform(img1) 53 | 54 | # Forward 55 | with torch.no_grad(): 56 | x = torch.tensor(img1) 57 | loc_preds, conf = net(x.unsqueeze(0)) 58 | # Decode 59 | data_encoder = DataEncoder() 60 | boxes, labels, scores = data_encoder.decode(loc_preds.data.squeeze(0), F.softmax(conf.squeeze(0), dim=1).data) 61 | for box, label, score in zip(boxes, labels, scores): 62 | for b, s in zip(box, score): 63 | if s > 0.25: 64 | b[::2] *= img.shape[1] 65 | b[1::2] *= img.shape[0] 66 | print('label:',VOC_LABELS[int(label[0])], 'score:', score) 67 | b = list(b) 68 | cv2.rectangle(img, (b[0], b[1]), (b[2], b[3]), (255, 255, 255), 2) 69 | title = '{}: {}'.format(VOC_LABELS[int(label[0])], round(float(score[0]), 2)) 70 | cv2.putText(img, title, (b[0], b[1]), cv2.FONT_ITALIC, 0.6, (0, 255, 0), 2) 71 | cv2.imshow('img', img) 72 | cv2.waitKey(0) -------------------------------------------------------------------------------- /MDSSD_512/scripts/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import argparse 5 | import itertools 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | import torch.backends.cudnn as cudnn 12 | 13 | import torchvision 14 | import torchvision.transforms as transforms 15 | 16 | import numpy as np 17 | from mdssd import MDSSD300 18 | from datagen import ListDataset 19 | from multibox_loss import MultiBoxLoss 20 | 21 | lr = 0.001 22 | resume = False # Resume from checkpoint 23 | epoch = 200 24 | batch_size = 8 25 | 26 | use_cuda = torch.cuda.is_available() 27 | best_loss = float('inf') # best test loss 28 | start_epoch = 0 # start from epoch 0 or last epoch 29 | 30 | # Data 31 | print('==> Preparing data..') 32 | transform = 
transforms.Compose([transforms.ToTensor(), 33 | transforms.Normalize(mean=(0.356, 0.368, 0.362), std=(0.242, 0.235, 0.236))]) 34 | 35 | trainset = ListDataset(root='../../../../VisDrone2019/dataset/VisDrone2018-DET-train/images/', list_file='../../../../VisDrone2019/dataset/VisDrone2018-DET-train/annotations/', train=True, transform=transform) 36 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True, num_workers=4) 37 | 38 | valset = ListDataset(root='../../../../VisDrone2019/dataset/VisDrone2019-DET-val/images/', list_file='../../../../VisDrone2019/dataset/VisDrone2019-DET-val/annotations/', train=True, transform=transform) 39 | valloader = torch.utils.data.DataLoader(valset, batch_size=8, shuffle=True, num_workers=4) 40 | 41 | # Model 42 | net = MDSSD300() 43 | if resume: 44 | print('==> Resuming from checkpoint..') 45 | checkpoint = torch.load('./checkpoint/ckpt.pth') 46 | 47 | keys = [] 48 | for k,v in checkpoint['net'].items(): 49 | if "module" in k: 50 | keys.append(k) 51 | for i in keys: 52 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 53 | del checkpoint['net'][i] 54 | 55 | net.load_state_dict(checkpoint['net']) 56 | best_loss = checkpoint['loss'] 57 | start_epoch = checkpoint['epoch'] 58 | else: 59 | # Convert from pretrained VGG model. 60 | try: 61 | net.load_state_dict(torch.load('../model/ssd.pth')) 62 | print('==> Pretrain model read successfully') 63 | except: 64 | print('==> Pretrain model read failed or not existed, training from init') 65 | 66 | criterion = MultiBoxLoss() 67 | 68 | if use_cuda: 69 | net = torch.nn.DataParallel(net, device_ids=[0]) 70 | net.cuda() 71 | cudnn.benchmark = True 72 | 73 | optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4) 74 | # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.25) 75 | 76 | # Training 77 | def train(epoch,prev_val_loss, last_saved): 78 | print('\nEpoch: %d' % epoch) 79 | net.train() 80 | train_loss = 0 81 | for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader): 82 | if use_cuda: 83 | images = images.cuda() 84 | loc_targets = loc_targets.cuda() 85 | conf_targets = conf_targets.cuda() 86 | 87 | images = torch.tensor(images) 88 | loc_targets = torch.tensor(loc_targets) 89 | conf_targets = torch.tensor(conf_targets) 90 | 91 | optimizer.zero_grad() 92 | loc_preds, conf_preds = net(images) 93 | loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 94 | loss.backward() 95 | optimizer.step() 96 | # scheduler.step() 97 | 98 | train_loss += loss.item() 99 | 100 | if batch_idx%100 == 0: 101 | val_loss_tot = 0 102 | for batch_idx_val, (images, loc_targets, conf_targets) in enumerate(valloader): 103 | if use_cuda: 104 | images = images.cuda() 105 | loc_targets = loc_targets.cuda() 106 | conf_targets = conf_targets.cuda() 107 | 108 | images = torch.tensor(images) 109 | loc_targets = torch.tensor(loc_targets) 110 | conf_targets = torch.tensor(conf_targets) 111 | 112 | loc_preds, conf_preds = net(images) 113 | val_loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 114 | val_loss_tot += val_loss.item() 115 | 116 | val_loss_tot /= (batch_idx_val+1) 117 | if val_loss_tot < prev_val_loss: 118 | os.makedirs('checkpoint', exist_ok=True) 119 | torch.save({ 120 | 'epoch': epoch, 121 | 'net': net.state_dict(), 122 | 'loss': loss, 123 | }, 'checkpoint/ckpt.pth') 124 | print("Saved.") 125 | prev_val_loss = val_loss_tot 126 | last_saved = [epoch, batch_idx] 127 | 128 | print('epoch: {}, batch_idx: {},loss: {}, train_loss: 
{}, best_val_loss: {}, last_saved: {}'.format(epoch, batch_idx, loss.item(), train_loss/(batch_idx+1), prev_val_loss, last_saved)) 129 | 130 | return prev_val_loss, last_saved 131 | 132 | 133 | prev_val_loss = 999 134 | last_saved = [start_epoch,0] 135 | for epoch_num in range(start_epoch, start_epoch+epoch): 136 | prev_val_loss, last_saved = train(epoch_num, prev_val_loss, last_saved) 137 | -------------------------------------------------------------------------------- /MDSSD_augment/scripts/augment.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import pandas as pd 3 | import shutil 4 | import os 5 | import numpy as np 6 | import glob 7 | import xml.etree.ElementTree as ET 8 | from xml.dom import minidom 9 | import random 10 | 11 | IMG_DIR = '../../../../VisDrone2019/dataset/VisDrone2018-DET-train/images/' 12 | ANNOT_DIR = '../../../../VisDrone2019/dataset/VisDrone2018-DET-train/annotations/' 13 | 14 | for i in os.listdir(ANNOT_DIR): 15 | box = [] 16 | labels = [] 17 | with open(os.path.join(ANNOT_DIR,i)) as f: 18 | f = f.read().split("\n") 19 | f = f[:-1] 20 | num_objs = len(f) 21 | 22 | for j in range(num_objs): 23 | f[j] = f[j].split(",") 24 | xmin = float(f[j][0]) 25 | ymin = float(f[j][1]) 26 | w = float(f[j][2]) 27 | h = float(f[j][3]) 28 | 29 | box.append([xmin,ymin,w,h]) 30 | labels.append(int(f[j][5])) 31 | 32 | img = cv2.imread(IMG_DIR+i[:-4]+".jpg") 33 | box_new = box.copy() 34 | img_new = img.copy() 35 | # cv2.imshow("Image", img) 36 | # cv2.waitKey(0) 37 | 38 | for j in box: 39 | if j[2]*j[3]<500: 40 | crop = img[int(j[1]):int(j[1]+j[3]),int(j[0]):int(j[0]+j[2])] 41 | x = random.randrange(0, img.shape[1],1) 42 | y = random.randrange(0, img.shape[0],1) 43 | 44 | try: 45 | img_new[int(y):int(y+j[3]),int(x):int(x+j[2])] = crop 46 | box_new.append([x,y,j[2],j[3]]) 47 | except: 48 | continue 49 | for j in box_new: 50 | img_new = cv2.rectangle(img_new,(int(j[0]),int(j[1])),(int(j[0]+j[2]),int(j[1]+j[3])),(255,0,0),1) 51 | 52 | cv2.imshow("Image", img_new) 53 | cv2.waitKey(0) 54 | break -------------------------------------------------------------------------------- /MDSSD_augment/scripts/datagen.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load image/class/box from a annotation file. 3 | 4 | ''' 5 | from __future__ import print_function 6 | 7 | import os 8 | import sys 9 | import os.path 10 | 11 | import random 12 | import numpy as np 13 | 14 | import torch 15 | import torch.utils.data as data 16 | import torchvision.transforms as transforms 17 | 18 | from encoder import DataEncoder 19 | import cv2 20 | 21 | import pandas as pd 22 | import shutil 23 | import os 24 | import numpy as np 25 | import glob 26 | import xml.etree.ElementTree as ET 27 | from xml.dom import minidom 28 | 29 | class ListDataset(data.Dataset): 30 | img_size = 300 31 | 32 | def __init__(self, root, list_file, train, transform): 33 | ''' 34 | Args: 35 | root: (str) ditectory to images. 36 | list_file: (str) path to annotation files. 37 | train: (boolean) train or test. 38 | transform: ([transforms]) image transforms. 
39 | ''' 40 | self.root = root 41 | self.list_file = list_file 42 | self.train = train 43 | self.transform = transform 44 | 45 | self.fnames = [] 46 | self.boxes = [] 47 | self.labels = [] 48 | 49 | self.data_encoder = DataEncoder() 50 | self.num_samples = 0 51 | 52 | for i in os.listdir(list_file): 53 | self.num_samples += 1 54 | self.fnames.append(i) 55 | 56 | def __getitem__(self, idx): 57 | '''Load a image, and encode its bbox locations and class labels. 58 | Args: 59 | idx: (int) image index. 60 | Returns: 61 | img: (tensor) image tensor. 62 | loc_target: (tensor) location targets, sized [8732,4]. 63 | conf_target: (tensor) label targets, sized [8732,]. 64 | ''' 65 | # Load image and bbox locations. 66 | fname = self.fnames[idx] 67 | img = cv2.imread(os.path.join(self.root, fname[:-4]+".jpg")) 68 | 69 | box = [] 70 | label = [] 71 | with open(os.path.join(self.list_file,fname)) as f: 72 | f = f.read().split("\n") 73 | f = f[:-1] 74 | num_objs = len(f) 75 | 76 | for j in range(num_objs): 77 | f[j] = f[j].split(",") 78 | xmin = float(f[j][0]) 79 | ymin = float(f[j][1]) 80 | w = float(f[j][2]) 81 | h = float(f[j][3]) 82 | 83 | box.append([xmin,ymin,xmin+w,ymin+h]) 84 | label.append(int(f[j][5])) 85 | 86 | # **************************** AUGMENTATION ************************************ 87 | # Copy and paste small objects at random locations in 88 | # image to increase the number of samples with small sizes. 89 | box_new = box.copy() 90 | label_new = label.copy() 91 | img_new = img.copy() 92 | for n in range(len(box)): 93 | j = box[n] 94 | if j[2]*j[3]<500: 95 | crop = img[int(j[1]):int(j[1]+j[3]),int(j[0]):int(j[0]+j[2])] 96 | x = random.randrange(0, img.shape[1],1) 97 | y = random.randrange(0, img.shape[0],1) 98 | 99 | try: 100 | img_new[int(y):int(y+j[3]),int(x):int(x+j[2])] = crop 101 | box_new.append([x,y,j[2],j[3]]) 102 | label_new.append(label[n]) 103 | except: 104 | continue 105 | 106 | # ******************************************************************************** 107 | 108 | self.boxes.append(torch.Tensor(box_new)) 109 | self.labels.append(torch.LongTensor(label_new)) 110 | img = img_new 111 | 112 | boxes = self.boxes[-1].clone() 113 | labels = self.labels[-1] 114 | 115 | # Data augmentation while training. 116 | if self.train: 117 | img, boxes = self.random_flip(img, boxes) 118 | img, boxes, labels = self.random_crop(img, boxes, labels) 119 | 120 | # Scale bbox locaitons to [0,1]. 121 | w,h = img.shape[1], img.shape[0] 122 | boxes /= torch.Tensor([w,h,w,h]).expand_as(boxes) 123 | img = cv2.resize(img, (self.img_size,self.img_size)) 124 | img = self.transform(img) 125 | 126 | # Encode loc & conf targets. 127 | 128 | loc_target, conf_target = self.data_encoder.encode(boxes, labels) 129 | return img, loc_target, conf_target 130 | 131 | def random_flip(self, img, boxes): 132 | '''Randomly flip the image and adjust the bbox locations. 133 | For bbox (xmin, ymin, xmax, ymax), the flipped bbox is: 134 | (w-xmax, ymin, w-xmin, ymax). 135 | Args: 136 | img: (ndarray.Image) image. f 137 | boxes: (tensor) bbox locations, sized [#obj, 4]. 138 | Returns: 139 | img: (ndarray.Image) randomly flipped image. 140 | boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4]. 
141 | ''' 142 | if random.random() < 0.5: 143 | img = cv2.flip(img, 1) 144 | w = img.shape[1] 145 | xmin = w - boxes[:,2] 146 | xmax = w - boxes[:,0] 147 | boxes[:,0] = xmin 148 | boxes[:,2] = xmax 149 | return img, boxes 150 | 151 | def random_crop(self, img, boxes, labels): 152 | '''Randomly crop the image and adjust the bbox locations. 153 | For more details, see 'Chapter2.2: Data augmentation' of the paper. 154 | Args: 155 | img: (ndarray.Image) image. 156 | boxes: (tensor) bbox locations, sized [#obj, 4]. 157 | labels: (tensor) bbox labels, sized [#obj,]. 158 | Returns: 159 | img: (ndarray.Image) cropped image. 160 | selected_boxes: (tensor) selected bbox locations. 161 | labels: (tensor) selected bbox labels. 162 | ''' 163 | imw, imh = img.shape[1], img.shape[0] 164 | while True: 165 | min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])# random choice the one 166 | if min_iou is None: 167 | return img, boxes, labels 168 | 169 | for _ in range(100): 170 | w = random.randrange(int(0.1*imw), imw) 171 | h = random.randrange(int(0.1*imh), imh) 172 | 173 | if h > 2*w or w > 2*h or h < 1 or w < 1: 174 | continue 175 | 176 | x = random.randrange(imw - w) 177 | y = random.randrange(imh - h) 178 | roi = torch.Tensor([[x, y, x+w, y+h]]) 179 | 180 | center = (boxes[:,:2] + boxes[:,2:]) / 2 # [N,2] 181 | roi2 = roi.expand(len(center), 4) # [N,4] 182 | 183 | mask = (center > roi2[:,:2]) & (center < roi2[:,2:]) # [N,2] 184 | mask = mask[:,0] & mask[:,1] #[N,] 185 | 186 | if not mask.any(): 187 | continue 188 | 189 | selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1)) 190 | 191 | iou = self.data_encoder.iou(selected_boxes, roi) 192 | if iou.min() < min_iou: 193 | continue 194 | img = img[y:y+h, x:x+w, :] 195 | 196 | selected_boxes[:,0].add_(-x).clamp_(min=0, max=w) 197 | selected_boxes[:,1].add_(-y).clamp_(min=0, max=h) 198 | selected_boxes[:,2].add_(-x).clamp_(min=0, max=w) 199 | selected_boxes[:,3].add_(-y).clamp_(min=0, max=h) 200 | 201 | return img, selected_boxes, labels[mask] 202 | 203 | def __len__(self): 204 | return self.num_samples -------------------------------------------------------------------------------- /MDSSD_augment/scripts/encoder.py: -------------------------------------------------------------------------------- 1 | '''Encode target locations and labels.''' 2 | import torch 3 | 4 | import math 5 | import itertools 6 | 7 | class DataEncoder: 8 | def __init__(self): 9 | '''Compute default box sizes with scale and aspect transform.''' 10 | scale = 300. 
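        # Sanity-check sketch, assuming the feature_map_sizes defined just below
        # and the anchor counts from multibox_layer.py ([6,6,4,4,4,4,4]): the
        # fused 300 configuration gives
        #
        #   fm_sizes         = (10, 5, 3, 1, 75, 38, 19)
        #   anchors_per_cell = (6, 6, 4, 4, 4, 4, 4)
        #   sum(a * f * f for a, f in zip(anchors_per_cell, fm_sizes))  # -> 30510
        #
        # i.e. 30510 default boxes per image rather than the 8732 quoted in the
        # docstrings.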
11 | steps = [s / scale for s in (32, 64, 100, 300, 4, 8, 16)] 12 | sizes_ssd = [s / scale for s in (111, 162, 213, 264, 315)] 13 | sizes_fusion = [s / scale for s in (15, 30, 60, 111)] 14 | aspect_ratios = ((2,3), (2,3), (2,), (2,), (2,), (2,), (2,)) 15 | feature_map_sizes = (10, 5, 3, 1, 75, 38, 19) 16 | num_layers = len(feature_map_sizes) 17 | 18 | boxes = [] 19 | for i in range(num_layers): 20 | fmsize = feature_map_sizes[i] # feature map size 21 | for h,w in itertools.product(range(fmsize), repeat=2): 22 | cx = (w + 0.5)*steps[i] 23 | cy = (h + 0.5)*steps[i] 24 | 25 | if i<4: 26 | s = sizes_ssd[i] 27 | boxes.append((cx, cy, s, s)) 28 | 29 | s = math.sqrt(sizes_ssd[i] * sizes_ssd[i+1]) 30 | boxes.append((cx, cy, s, s)) 31 | 32 | s = sizes_ssd[i] 33 | for ar in aspect_ratios[i]: 34 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 35 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 36 | 37 | else: 38 | s = sizes_fusion[i-4] 39 | boxes.append((cx, cy, s, s)) 40 | 41 | s = math.sqrt(sizes_fusion[i-4] * sizes_fusion[i-4+1]) 42 | boxes.append((cx, cy, s, s)) 43 | 44 | s = sizes_fusion[i-4] 45 | for ar in aspect_ratios[i]: 46 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 47 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 48 | 49 | self.default_boxes = torch.Tensor(boxes) 50 | 51 | def iou(self, box1, box2): 52 | '''Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2]. 53 | 54 | Args: 55 | box1: (tensor) bounding boxes, sized [N,4]. 56 | box2: (tensor) bounding boxes, sized [M,4]. 57 | 58 | Return: 59 | (tensor) iou, sized [N,M]. 60 | ''' 61 | N = box1.size(0) 62 | M = box2.size(0) 63 | 64 | lt = torch.max( 65 | box1[:,:2].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 66 | box2[:,:2].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 67 | ) 68 | 69 | rb = torch.min( 70 | box1[:,2:].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 71 | box2[:,2:].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 72 | ) 73 | 74 | wh = rb - lt # [N,M,2] 75 | wh[wh<0] = 0 # clip at 0 76 | inter = wh[:,:,0] * wh[:,:,1] # [N,M] 77 | 78 | area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1]) # [N,] 79 | area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1]) # [M,] 80 | area1 = area1.unsqueeze(1).expand_as(inter) # [N,] -> [N,1] -> [N,M] 81 | area2 = area2.unsqueeze(0).expand_as(inter) # [M,] -> [1,M] -> [N,M] 82 | 83 | iou = inter / (area1 + area2 - inter) 84 | return iou 85 | 86 | def encode(self, boxes, classes, threshold=0.5): 87 | '''Transform target bounding boxes and class labels to SSD boxes and classes. 88 | 89 | Match each object box to all the default boxes, pick the ones with the 90 | Jaccard-Index > 0.5: 91 | Jaccard(A,B) = AB / (A+B-AB) 92 | 93 | Args: 94 | boxes: (tensor) object bounding boxes (xmin,ymin,xmax,ymax) of a image, sized [#obj, 4]. 95 | classes: (tensor) object class labels of a image, sized [#obj,]. 96 | threshold: (float) Jaccard index threshold 97 | 98 | Returns: 99 | boxes: (tensor) bounding boxes, sized [#obj, 8732, 4]. 
100 | classes: (tensor) class labels, sized [8732,] 101 | ''' 102 | default_boxes = self.default_boxes 103 | num_default_boxes = default_boxes.size(0) 104 | num_objs = boxes.size(0) 105 | 106 | iou = self.iou( # [#obj,8732] 107 | boxes, 108 | torch.cat([default_boxes[:,:2] - default_boxes[:,2:]/2, 109 | default_boxes[:,:2] + default_boxes[:,2:]/2], 1) 110 | ) 111 | 112 | iou, max_idx = iou.max(0) # [1,8732] 113 | max_idx.squeeze_(0) # [8732,] 114 | iou.squeeze_(0) # [8732,] 115 | 116 | boxes = boxes[max_idx] # [8732,4] 117 | variances = [0.1, 0.2] 118 | cxcy = (boxes[:,:2] + boxes[:,2:])/2 - default_boxes[:,:2] # [8732,2] 119 | cxcy /= variances[0] * default_boxes[:,2:] 120 | wh = (boxes[:,2:] - boxes[:,:2]) / default_boxes[:,2:] # [8732,2] 121 | wh = torch.log(wh) / variances[1] 122 | loc = torch.cat([cxcy, wh], 1) # [8732,4] 123 | 124 | conf = 1 + classes[max_idx] # [8732,], background class = 0 125 | conf[iou 0: 153 | try: 154 | i = order[0] 155 | except: 156 | i = order 157 | keep.append(i) 158 | 159 | if order.size == 1: 160 | break 161 | 162 | xx1 = x1[order[1:]].clamp(min=x1[i]) 163 | yy1 = y1[order[1:]].clamp(min=y1[i]) 164 | xx2 = x2[order[1:]].clamp(max=x2[i]) 165 | yy2 = y2[order[1:]].clamp(max=y2[i]) 166 | 167 | w = (xx2-xx1).clamp(min=0) 168 | h = (yy2-yy1).clamp(min=0) 169 | inter = w*h 170 | 171 | if mode == 'union': 172 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 173 | elif mode == 'min': 174 | ovr = inter / areas[order[1:]].clamp(max=areas[i]) 175 | else: 176 | raise TypeError('Unknown nms mode: %s.' % mode) 177 | 178 | ids = (ovr<=threshold).nonzero().squeeze() 179 | if ids.size == 0: 180 | break 181 | order = order[ids+1] 182 | return torch.LongTensor(keep) 183 | 184 | def decode(self, loc, conf): 185 | '''Transform predicted loc/conf back to real bbox locations and class labels. 186 | 187 | Args: 188 | loc: (tensor) predicted loc, sized [8732,4]. 189 | conf: (tensor) predicted conf, sized [8732,21]. 190 | 191 | Returns: 192 | boxes: (tensor) bbox locations, sized [#obj, 4]. 193 | labels: (tensor) class labels, sized [#obj,1]. 
194 | ''' 195 | variances = (0.1, 0.2) 196 | wh = torch.exp(loc[:,2:]*variances[1]) * self.default_boxes[:,2:] 197 | cxcy = loc[:,:2] * variances[0] * self.default_boxes[:,2:] + self.default_boxes[:,:2] 198 | box_preds = torch.cat([cxcy-wh/2, cxcy+wh/2], 1) # [8732,4] 199 | 200 | boxes = [] 201 | labels = [] 202 | scores = [] 203 | num_classes = conf.size(1) 204 | for i in range(num_classes-1): 205 | score = conf[:,i+1] # class i corresponds to (i+1) column 206 | mask = score > 0.1 207 | 208 | if not mask.any(): 209 | continue 210 | 211 | box = box_preds[mask.nonzero().squeeze()] 212 | score = score[mask] 213 | 214 | if len(score) == 1: 215 | continue 216 | keep = self.nms(box, score, threshold=0.3) 217 | boxes.append(box[keep]) 218 | labels.append(torch.LongTensor(len(box[keep])).fill_(i)) 219 | scores.append(score[keep]) 220 | 221 | return boxes, labels, scores 222 | -------------------------------------------------------------------------------- /MDSSD_augment/scripts/fusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | from torch.autograd import Variable 7 | 8 | from norm import L2Norm 9 | 10 | class FusionBlock(nn.Module): 11 | def __init__(self, big_features, small_features): 12 | super(FusionBlock, self).__init__() 13 | 14 | # Bigger feature map 15 | self.conv1_1 = nn.Conv2d(big_features, 256, kernel_size=3, padding=1, dilation=1) 16 | self.Norm1 = L2Norm(256, 20) 17 | 18 | # Smaller feature map 19 | self.deconv2_1 = nn.ConvTranspose2d(small_features, 256, 2, stride=2, dilation=1) 20 | self.conv2_1 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 21 | self.deconv2_2 = nn.ConvTranspose2d(256, 256, 2, stride=2, dilation=1) 22 | self.conv2_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 23 | self.deconv2_3 = nn.ConvTranspose2d(256, 256, 3, stride=2, dilation=1) 24 | self.conv2_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 25 | self.Norm2 = L2Norm(256, 20) 26 | 27 | # Common 28 | self.conv3_1 = nn.Conv2d(256, big_features, kernel_size=3, padding=1, dilation=1) 29 | 30 | 31 | def forward(self, big, small): 32 | h1 = self.conv1_1(big) 33 | h1 = self.Norm1(h1) 34 | 35 | h2 = self.deconv2_1(small) 36 | # print(h2.size()) 37 | h2 = F.relu(self.conv2_1(h2)) 38 | # print(h2.size()) 39 | h2 = self.deconv2_2(h2) 40 | # print(h2.size()) 41 | h2 = F.relu(self.conv2_2(h2)) 42 | # print(h2.size()) 43 | h2 = self.deconv2_3(h2) 44 | # print(h2.size()) 45 | h2 = self.conv2_3(h2) 46 | # print(h2.size()) 47 | h2 = self.Norm2(h2) 48 | 49 | size = h2.size()[3] 50 | diff_odd = h2.size()[-1] - h1.size()[-1] 51 | h2 = h2[:,:,(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2)),(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2))] 52 | 53 | # print(h1.size(), h2.size()) 54 | h = F.relu(h1+h2) 55 | h = F.relu(self.conv3_1(h)) 56 | 57 | return h 58 | 59 | if __name__ == '__main__': 60 | big = torch.randn(1, 256, 128, 128) 61 | small = torch.rand(1,512,16,16) 62 | net = FusionBlock(256,512) 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /MDSSD_augment/scripts/mdssd.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | 9 | from torch.autograd import Variable 10 | 11 | from 
multibox_layer import MultiBoxLayer 12 | from fusion import FusionBlock 13 | from norm import L2Norm 14 | 15 | class MDSSD300(nn.Module): 16 | input_size = 300 17 | 18 | def __init__(self): 19 | super(MDSSD300, self).__init__() 20 | 21 | # model 22 | self.base = self.VGG16() 23 | self.norm4 = L2Norm(512, 20) # 38 24 | 25 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 26 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 27 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 28 | 29 | self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 30 | 31 | self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 32 | 33 | self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1) 34 | self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2) 35 | 36 | self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1) 37 | self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2) 38 | 39 | self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1) 40 | self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3) 41 | 42 | self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1) 43 | self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3) 44 | 45 | self.Fusion1 = FusionBlock(256,512) 46 | self.Fusion2 = FusionBlock(512,256) 47 | self.Fusion3 = FusionBlock(1024,256) 48 | 49 | # multibox layer 50 | self.multibox = MultiBoxLayer() 51 | 52 | def forward(self, x): 53 | odd_count = 0 54 | odd = [] 55 | hs = [] 56 | vgg = [] 57 | fusion_layers = [] 58 | h = self.base[0](x) 59 | vgg.append(h) 60 | for i in range(1,len(self.base)): 61 | h = self.base[i](h) 62 | vgg.append(h) 63 | fusion_layers.append(vgg[15]) 64 | odd.append(2) 65 | odd_count = 3 66 | fusion_layers.append(h) 67 | h = F.max_pool2d(h, kernel_size=2, stride=2, ceil_mode=True) 68 | 69 | h = F.relu(self.conv5_1(h)) 70 | h = F.relu(self.conv5_2(h)) 71 | h = F.relu(self.conv5_3(h)) 72 | h = F.max_pool2d(h, kernel_size=3, padding=1, stride=1, ceil_mode=True) 73 | 74 | h = F.relu(self.conv6(h)) 75 | h = F.relu(self.conv7(h)) 76 | fusion_layers.append(h) 77 | 78 | h = F.relu(self.conv8_1(h)) 79 | h = F.relu(self.conv8_2(h)) 80 | hs.append(h) # conv8_2 81 | 82 | h = F.relu(self.conv9_1(h)) 83 | h = F.relu(self.conv9_2(h)) 84 | hs.append(h) # conv9_2 85 | 86 | h = F.relu(self.conv10_1(h)) 87 | h = F.relu(self.conv10_2(h)) 88 | hs.append(h) # conv10_2 89 | 90 | h = F.relu(self.conv11_1(h)) 91 | h = F.relu(self.conv11_2(h)) 92 | hs.append(h) # conv11_2 93 | 94 | # Fusion Blocks 95 | f = self.Fusion1(fusion_layers[0],hs[-4]) 96 | hs.append(f) 97 | f = self.Fusion2(fusion_layers[1],hs[-4]) 98 | hs.append(f) 99 | diff_odd = fusion_layers[2].size()[-1] - hs[-4].size()[-1] 100 | f = self.Fusion3(fusion_layers[2],hs[-4]) 101 | hs.append(f) 102 | 103 | loc_preds, conf_preds = self.multibox(hs) 104 | 105 | return loc_preds, conf_preds 106 | 107 | def VGG16(self): 108 | '''VGG16 layers.''' 109 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512] 110 | layers = [] 111 | in_channels = 3 112 | for x in cfg: 113 | if x == 'M': 114 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 115 | else: 116 | layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 117 | nn.ReLU(True)] 118 | in_channels = x 119 | return nn.Sequential(*layers) 120 | 121 | if __name__ == '__main__': 122 | t = torch.randn(1, 3, 300, 300) 123 | net = MDSSD300() 124 | # print(net) 125 | res = net.forward(t) 126 | -------------------------------------------------------------------------------- 
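For reference, here is a minimal smoke test of the MDSSD300 model listed above (a sketch, not part of the repository): it pushes one random 300x300 image through the network and inspects the two multibox outputs. The exact number of default boxes depends on the feature-map sizes consumed by the multibox head listed next; only the last dimensions (4 box offsets, 13 class scores) are fixed by the code.

```python
# Sketch: forward one dummy image through MDSSD300 (assumes the scripts
# directory is on the import path). Output shapes follow multibox_layer.py below.
import torch
from mdssd import MDSSD300

net = MDSSD300()
net.eval()
with torch.no_grad():
    x = torch.randn(1, 3, 300, 300)        # dummy RGB batch of one image
    loc_preds, conf_preds = net(x)

print(loc_preds.shape)    # [1, num_default_boxes, 4]  -> box regression offsets
print(conf_preds.shape)   # [1, num_default_boxes, 13] -> 12 VisDrone categories + background
```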
/MDSSD_augment/scripts/multibox_layer.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLayer(nn.Module): 12 | num_classes = 13 13 | num_anchors = [6,6,4,4,4,4,4] 14 | 15 | def __init__(self): 16 | super(MultiBoxLayer, self).__init__() 17 | self.in_planes = [512,256,256,256,256,512,1024] 18 | self.loc_layers = nn.ModuleList() 19 | self.conf_layers = nn.ModuleList() 20 | for i in range(len(self.in_planes)): 21 | self.loc_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*4, kernel_size=3, padding=1)) 22 | self.conf_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*13, kernel_size=3, padding=1)) 23 | 24 | def forward(self, xs): 25 | ''' 26 | Args: 27 | xs: (list) of tensor containing intermediate layer outputs. 28 | 29 | Returns: 30 | loc_preds: (tensor) predicted locations, sized [N,8732,4]. 31 | conf_preds: (tensor) predicted class confidences, sized [N,8732,21]. 32 | ''' 33 | y_locs = [] 34 | y_confs = [] 35 | for i, x in enumerate(xs): 36 | y_loc = self.loc_layers[i](x) 37 | N = y_loc.size(0) 38 | y_loc = y_loc.permute(0,2,3,1).contiguous() 39 | y_loc = y_loc.view(N,-1,4) 40 | y_locs.append(y_loc) 41 | 42 | y_conf = self.conf_layers[i](x) 43 | y_conf = y_conf.permute(0,2,3,1).contiguous() 44 | y_conf = y_conf.view(N,-1,13) 45 | y_confs.append(y_conf) 46 | 47 | loc_preds = torch.cat(y_locs, 1) 48 | conf_preds = torch.cat(y_confs, 1) 49 | return loc_preds, conf_preds 50 | -------------------------------------------------------------------------------- /MDSSD_augment/scripts/multibox_loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLoss(nn.Module): 12 | num_classes = 13 13 | 14 | def __init__(self): 15 | super(MultiBoxLoss, self).__init__() 16 | 17 | def cross_entropy_loss(self, x, y): 18 | '''Cross entropy loss w/o averaging across all samples. 19 | 20 | Args: 21 | x: (tensor) sized [N,D]. 22 | y: (tensor) sized [N,]. 23 | 24 | Return: 25 | (tensor) cross entroy loss, sized [N,]. 26 | ''' 27 | xmax = x.data.max() 28 | print('x y size {} {}'.format(x.size(), y.size())) 29 | log_sum_exp = torch.log(torch.sum(torch.exp(x-xmax), 1)) + xmax 30 | print('log_sum_exp {}'.format(log_sum_exp.size())) 31 | return log_sum_exp - x.gather(1, y.view(-1,1)) 32 | 33 | def hard_negative_mining(self, conf_loss, pos): 34 | '''Return negative indices that is 3x the number as postive indices. 35 | 36 | Args: 37 | conf_loss: (tensor) cross entroy loss between conf_preds and conf_targets, sized [N*8732,]. 38 | pos: (tensor) positive(matched) box indices, sized [N,8732]. 39 | 40 | Return: 41 | (tensor) negative indices, sized [N,8732]. 
42 | ''' 43 | batch_size, num_boxes = pos.size() 44 | conf_loss[pos.view(-1)] = 0 # set pos boxes = 0, the rest are neg conf_loss 45 | conf_loss = conf_loss.view(batch_size, -1) # [N,8732] 46 | 47 | _,idx = conf_loss.sort(1, descending=True) # sort by neg conf_loss 48 | _,rank = idx.sort(1) # [N,8732] 49 | 50 | num_pos = pos.long().sum(1) # [N,1] 51 | num_neg = torch.clamp(3*num_pos, max=num_boxes-1) # [N,1] 52 | 53 | neg = rank < num_neg.unsqueeze(1).expand_as(rank) 54 | 55 | return neg 56 | 57 | def forward(self, loc_preds, loc_targets, conf_preds, conf_targets): 58 | '''Compute loss between (loc_preds, loc_targets) and (conf_preds, conf_targets). 59 | 60 | Args: 61 | loc_preds: (tensor) predicted locations, sized [batch_size, 8732, 4]. 62 | loc_targets: (tensor) encoded target locations, sized [batch_size, 8732, 4]. 63 | conf_preds: (tensor) predicted class confidences, sized [batch_size, 8732, num_classes]. 64 | conf_targets: (tensor) encoded target classes, sized [batch_size, 8732]. 65 | 66 | loss: 67 | (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + CrossEntropyLoss(conf_preds, conf_targets). 68 | ''' 69 | batch_size, num_boxes, _ = loc_preds.size() 70 | pos = conf_targets > 0 # [N,8732], pos means the box matched. 71 | num_matched_boxes = pos.data.float().sum() 72 | if num_matched_boxes == 0: 73 | return torch.tensor([0.], requires_grad=True) 74 | 75 | ################################################################ 76 | # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets) 77 | ################################################################ 78 | pos_mask = pos.unsqueeze(2).expand_as(loc_preds) # [N,8732,4] 79 | pos_loc_preds = loc_preds[pos_mask].view(-1,4) # [#pos,4] 80 | pos_loc_targets = loc_targets[pos_mask].view(-1,4) # [#pos,4] 81 | 82 | loc_loss = F.smooth_l1_loss(pos_loc_preds, pos_loc_targets, size_average=False) 83 | 84 | ################################################################ 85 | # conf_loss = CrossEntropyLoss(pos_conf_preds, pos_conf_targets) 86 | # + CrossEntropyLoss(neg_conf_preds, neg_conf_targets) 87 | ################################################################ 88 | conf_preds = conf_preds.contiguous() 89 | conf_loss = F.cross_entropy(conf_preds.view(-1,self.num_classes), \ 90 | conf_targets.view(-1), reduce=False) # [N*8732,] 91 | neg = self.hard_negative_mining(conf_loss, pos) # [N,8732] 92 | 93 | pos_mask = pos.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 94 | neg_mask = neg.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 95 | mask = (pos_mask+neg_mask).gt(0) 96 | 97 | pos_and_neg = (pos+neg).gt(0) 98 | preds = conf_preds[mask].view(-1,self.num_classes) # [#pos+#neg,21] 99 | targets = conf_targets[pos_and_neg] # [#pos+#neg,] 100 | conf_loss = F.cross_entropy(preds, targets, size_average=False) 101 | 102 | loc_loss /= num_matched_boxes 103 | conf_loss /= num_matched_boxes 104 | 105 | return loc_loss + conf_loss 106 | -------------------------------------------------------------------------------- /MDSSD_augment/scripts/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | class L2Norm(nn.Module): 7 | '''L2Norm layer across all channels and scale.''' 8 | def __init__(self, in_features,scale): 9 | super(L2Norm, self).__init__() 10 | self.weight = nn.Parameter(torch.Tensor(in_features)) 11 | self.reset_parameters(scale) 12 | 13 | def reset_parameters(self, scale): 14 | 
nn.init.constant_(self.weight, scale) 15 | 16 | def forward(self, x): 17 | x = F.normalize(x, dim=1) 18 | scale = self.weight[None,:,None,None] 19 | return scale * x -------------------------------------------------------------------------------- /MDSSD_augment/scripts/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.nn.functional as F 4 | import torchvision.transforms as transforms 5 | 6 | import sys 7 | from mdssd import MDSSD300 8 | from encoder import DataEncoder 9 | import cv2 10 | 11 | VOC_LABELS = ( 12 | 'ignored regions', 13 | 'pedestrian', 14 | 'people', 15 | 'bicycle', 16 | 'car', 17 | 'van', 18 | 'truck', 19 | 'tricycle', 20 | 'awning-tricycle', 21 | 'bus', 22 | 'motor', 23 | 'other' 24 | ) 25 | 26 | 27 | # Load model 28 | net = MDSSD300() 29 | checkpoint = torch.load('./checkpoint/ckpt.pth') 30 | 31 | keys = [] 32 | for k,v in checkpoint['net'].items(): 33 | if "module" in k: 34 | keys.append(k) 35 | for i in keys: 36 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 37 | del checkpoint['net'][i] 38 | 39 | net.load_state_dict(checkpoint['net']) 40 | net.eval() 41 | 42 | if len(sys.argv) == 2: 43 | img_path = sys.argv[1] 44 | else: 45 | img_path = './images/img5.jpg' 46 | # Load test image 47 | img = cv2.imread(img_path) 48 | 49 | img1 = cv2.resize(img, (300, 300)) 50 | transform = transforms.Compose([transforms.ToTensor(), 51 | transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]) 52 | img1 = transform(img1) 53 | 54 | # Forward 55 | with torch.no_grad(): 56 | x = torch.tensor(img1) 57 | loc_preds, conf = net(x.unsqueeze(0)) 58 | # Decode 59 | data_encoder = DataEncoder() 60 | boxes, labels, scores = data_encoder.decode(loc_preds.data.squeeze(0), F.softmax(conf.squeeze(0), dim=1).data) 61 | for box, label, score in zip(boxes, labels, scores): 62 | for b, s in zip(box, score): 63 | if s > 0.25:#0.7: 64 | b[::2] *= img.shape[1] 65 | b[1::2] *= img.shape[0] 66 | print('label:',VOC_LABELS[int(label[0])], 'score:', score) 67 | b = list(b) 68 | cv2.rectangle(img, (b[0], b[1]), (b[2], b[3]), (255, 255, 255), 2) 69 | title = '{}: {}'.format(VOC_LABELS[int(label[0])], round(float(score[0]), 2)) 70 | cv2.putText(img, title, (b[0], b[1]), cv2.FONT_ITALIC, 0.6, (0, 255, 0), 2) 71 | cv2.imshow('img', img) 72 | cv2.waitKey(0) -------------------------------------------------------------------------------- /MDSSD_augment/scripts/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import argparse 5 | import itertools 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | import torch.backends.cudnn as cudnn 12 | 13 | import torchvision 14 | import torchvision.transforms as transforms 15 | 16 | import numpy as np 17 | from mdssd import MDSSD300 18 | from datagen import ListDataset 19 | from multibox_loss import MultiBoxLoss 20 | 21 | TRAIN_IMG_DIR = '../../../../VisDrone2019/dataset/VisDrone2018-DET-train/images/' 22 | TRAIN_ANNOT_DIR = '../../../../VisDrone2019/dataset/VisDrone2018-DET-train/annotations/' 23 | VAL_IMAGE_DIR = '../../../../VisDrone2019/dataset/VisDrone2019-DET-val/images/' 24 | VAL_ANNOT_DIR = '../../../../VisDrone2019/dataset/VisDrone2019-DET-val/annotations/' 25 | 26 | lr = 0.001 27 | resume = False # Resume from checkpoint 28 | epoch = 200 29 | batch_size = 8 30 | 31 | use_cuda = 
torch.cuda.is_available() 32 | best_loss = float('inf') # best test loss 33 | start_epoch = 0 # start from epoch 0 or last epoch 34 | 35 | # Data 36 | print('==> Preparing data..') 37 | transform = transforms.Compose([transforms.ToTensor(), 38 | transforms.Normalize(mean=(0.356, 0.368, 0.362), std=(0.242, 0.235, 0.236))]) 39 | 40 | trainset = ListDataset(root=TRAIN_IMG_DIR, list_file=TRAIN_ANNOT_DIR, train=True, transform=transform) 41 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True, num_workers=4) 42 | 43 | valset = ListDataset(root=VAL_IMAGE_DIR, list_file=VAL_ANNOT_DIR, train=True, transform=transform) 44 | valloader = torch.utils.data.DataLoader(valset, batch_size=8, shuffle=True, num_workers=4) 45 | 46 | # Model 47 | net = MDSSD300() 48 | if resume: 49 | print('==> Resuming from checkpoint..') 50 | checkpoint = torch.load('./checkpoint/ckpt.pth') 51 | 52 | keys = [] 53 | for k,v in checkpoint['net'].items(): 54 | if "module" in k: 55 | keys.append(k) 56 | for i in keys: 57 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 58 | del checkpoint['net'][i] 59 | 60 | net.load_state_dict(checkpoint['net']) 61 | best_loss = checkpoint['loss'] 62 | start_epoch = checkpoint['epoch'] 63 | else: 64 | # Convert from pretrained VGG model. 65 | try: 66 | net.load_state_dict(torch.load('../model/ssd.pth')) 67 | print('==> Pretrain model read successfully') 68 | except: 69 | print('==> Pretrain model read failed or not existed, training from init') 70 | 71 | criterion = MultiBoxLoss() 72 | 73 | if use_cuda: 74 | net = torch.nn.DataParallel(net, device_ids=[0]) 75 | net.cuda() 76 | cudnn.benchmark = True 77 | 78 | optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4) 79 | # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.25) 80 | 81 | # Training 82 | def train(epoch,prev_val_loss, last_saved): 83 | print('\nEpoch: %d' % epoch) 84 | net.train() 85 | train_loss = 0 86 | for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader): 87 | if use_cuda: 88 | images = images.cuda() 89 | loc_targets = loc_targets.cuda() 90 | conf_targets = conf_targets.cuda() 91 | 92 | images = torch.tensor(images) 93 | loc_targets = torch.tensor(loc_targets) 94 | conf_targets = torch.tensor(conf_targets) 95 | 96 | optimizer.zero_grad() 97 | loc_preds, conf_preds = net(images) 98 | loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 99 | loss.backward() 100 | optimizer.step() 101 | # scheduler.step() 102 | train_loss += loss.item() 103 | 104 | if batch_idx%100 == 0: 105 | val_loss_tot = 0 106 | for batch_idx_val, (images, loc_targets, conf_targets) in enumerate(valloader): 107 | if use_cuda: 108 | images = images.cuda() 109 | loc_targets = loc_targets.cuda() 110 | conf_targets = conf_targets.cuda() 111 | 112 | images = torch.tensor(images) 113 | loc_targets = torch.tensor(loc_targets) 114 | conf_targets = torch.tensor(conf_targets) 115 | 116 | loc_preds, conf_preds = net(images) 117 | val_loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 118 | val_loss_tot += val_loss.item() 119 | 120 | val_loss_tot /= (batch_idx_val+1) 121 | if val_loss_tot < prev_val_loss: 122 | os.makedirs('checkpoint', exist_ok=True) 123 | torch.save({ 124 | 'epoch': epoch, 125 | 'net': net.state_dict(), 126 | 'loss': loss, 127 | }, 'checkpoint/ckpt.pth') 128 | print("Saved.") 129 | prev_val_loss = val_loss_tot 130 | last_saved = [epoch, batch_idx] 131 | # print('epoch: {}, batch_idx: {},loss: {}, train_loss: {}'.format(epoch, 
batch_idx, loss.item(), train_loss/(batch_idx+1))) 132 | print('epoch: {}, batch_idx: {},loss: {}, train_loss: {}, best_val_loss: {}, last_saved: {}'.format(epoch, batch_idx, loss.item(), train_loss/(batch_idx+1), prev_val_loss, last_saved)) 133 | 134 | return prev_val_loss, last_saved 135 | 136 | 137 | prev_val_loss = 999 138 | last_saved = [start_epoch,0] 139 | for epoch_num in range(start_epoch, start_epoch+epoch): 140 | prev_val_loss, last_saved = train(epoch_num, prev_val_loss, last_saved) 141 | -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | class AttentionBlock(nn.Module): 7 | def __init__(self, channels): 8 | super(AttentionBlock, self).__init__() 9 | 10 | self.f = nn.Conv2d(channels, channels, kernel_size=3, padding=1, dilation=1) 11 | self.g = nn.Conv2d(channels, channels, kernel_size=3, padding=1, dilation=1) 12 | self.h = nn.Conv2d(channels, channels, kernel_size=3, padding=1, dilation=1) 13 | 14 | def forward(self, x): 15 | f = self.f(x) 16 | f = torch.transpose(f,-2,-1) 17 | g = self.g(x) 18 | h = self.h(x) 19 | 20 | attention_map = torch.mul(f,g) 21 | out = torch.mul(h, attention_map) 22 | return out 23 | 24 | 25 | if __name__ == "__main__": 26 | x = torch.rand(1,3,300,300) 27 | att = AttentionBlock(3) 28 | x = att(x) 29 | print(x.size()) 30 | -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/datagen.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Load image/class/box from a annotation file. 3 | 4 | ''' 5 | from __future__ import print_function 6 | 7 | import os 8 | import sys 9 | import os.path 10 | 11 | import random 12 | import numpy as np 13 | 14 | import torch 15 | import torch.utils.data as data 16 | import torchvision.transforms as transforms 17 | 18 | from encoder import DataEncoder 19 | import cv2 20 | 21 | import pandas as pd 22 | import shutil 23 | import os 24 | import numpy as np 25 | import glob 26 | import xml.etree.ElementTree as ET 27 | from xml.dom import minidom 28 | 29 | class ListDataset(data.Dataset): 30 | img_size = 300 31 | 32 | def __init__(self, root, list_file, train, transform): 33 | ''' 34 | Args: 35 | root: (str) ditectory to images. 36 | list_file: (str) path to annotation files. 37 | train: (boolean) train or test. 38 | transform: ([transforms]) image transforms. 39 | ''' 40 | self.root = root 41 | self.list_file = list_file 42 | self.train = train 43 | self.transform = transform 44 | 45 | self.fnames = [] 46 | self.boxes = [] 47 | self.labels = [] 48 | 49 | self.data_encoder = DataEncoder() 50 | self.num_samples = 0 51 | 52 | # VisDrone 53 | 54 | for i in os.listdir(list_file): 55 | self.num_samples += 1 56 | self.fnames.append(i) 57 | 58 | def __getitem__(self, idx): 59 | '''Load a image, and encode its bbox locations and class labels. 60 | Args: 61 | idx: (int) image index. 62 | Returns: 63 | img: (tensor) image tensor. 64 | loc_target: (tensor) location targets, sized [8732,4]. 65 | conf_target: (tensor) label targets, sized [8732,]. 66 | ''' 67 | # Load image and bbox locations. 
68 | fname = self.fnames[idx] 69 | img = cv2.imread(os.path.join(self.root, fname[:-4]+".jpg")) 70 | 71 | box = [] 72 | label = [] 73 | with open(os.path.join(self.list_file,fname)) as f: 74 | f = f.read().split("\n") 75 | f = f[:-1] 76 | num_objs = len(f) 77 | 78 | for j in range(num_objs): 79 | f[j] = f[j].split(",") 80 | xmin = float(f[j][0]) 81 | ymin = float(f[j][1]) 82 | w = float(f[j][2]) 83 | h = float(f[j][3]) 84 | 85 | box.append([xmin,ymin,xmin+w,ymin+h]) 86 | label.append(int(f[j][5])) 87 | 88 | # **************************** AUGMENTATION ************************************ 89 | # Copy and paste small objects at random locations in 90 | # image to increase the number of samples with small sizes. 91 | box_new = box.copy() 92 | label_new = label.copy() 93 | img_new = img.copy() 94 | for n in range(len(box)): 95 | j = box[n] # box is stored as (xmin, ymin, xmax, ymax) 96 | if (j[2]-j[0])*(j[3]-j[1]) < 500: # small object: area below 500 px 97 | crop = img[int(j[1]):int(j[3]),int(j[0]):int(j[2])] 98 | x = random.randrange(0, img.shape[1],1) 99 | y = random.randrange(0, img.shape[0],1) 100 | 101 | try: 102 | img_new[int(y):int(y+j[3]-j[1]),int(x):int(x+j[2]-j[0])] = crop 103 | box_new.append([x,y,x+j[2]-j[0],y+j[3]-j[1]]) 104 | label_new.append(label[n]) 105 | except: 106 | continue # paste region clipped at the image border; skip this copy 107 | 108 | # ******************************************************************************** 109 | 110 | self.boxes.append(torch.Tensor(box_new)) 111 | self.labels.append(torch.LongTensor(label_new)) 112 | img = img_new 113 | 114 | boxes = self.boxes[-1].clone() 115 | labels = self.labels[-1] 116 | 117 | # Data augmentation while training. 118 | if self.train: 119 | img, boxes = self.random_flip(img, boxes) 120 | img, boxes, labels = self.random_crop(img, boxes, labels) 121 | 122 | # Scale bbox locations to [0,1]. 123 | w,h = img.shape[1], img.shape[0] 124 | boxes /= torch.Tensor([w,h,w,h]).expand_as(boxes) 125 | img = cv2.resize(img, (self.img_size,self.img_size)) 126 | img = self.transform(img) 127 | 128 | # Encode loc & conf targets. 129 | 130 | loc_target, conf_target = self.data_encoder.encode(boxes, labels) 131 | return img, loc_target, conf_target 132 | 133 | def random_flip(self, img, boxes): 134 | '''Randomly flip the image and adjust the bbox locations. 135 | For bbox (xmin, ymin, xmax, ymax), the flipped bbox is: 136 | (w-xmax, ymin, w-xmin, ymax). 137 | Args: 138 | img: (ndarray.Image) image. 139 | boxes: (tensor) bbox locations, sized [#obj, 4]. 140 | Returns: 141 | img: (ndarray.Image) randomly flipped image. 142 | boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4]. 143 | ''' 144 | if random.random() < 0.5: 145 | img = cv2.flip(img, 1) 146 | w = img.shape[1] 147 | xmin = w - boxes[:,2] 148 | xmax = w - boxes[:,0] 149 | boxes[:,0] = xmin 150 | boxes[:,2] = xmax 151 | return img, boxes 152 | 153 | def random_crop(self, img, boxes, labels): 154 | '''Randomly crop the image and adjust the bbox locations. 155 | For more details, see 'Chapter 2.2: Data augmentation' of the paper. 156 | Args: 157 | img: (ndarray.Image) image. 158 | boxes: (tensor) bbox locations, sized [#obj, 4]. 159 | labels: (tensor) bbox labels, sized [#obj,]. 160 | Returns: 161 | img: (ndarray.Image) cropped image. 162 | selected_boxes: (tensor) selected bbox locations. 163 | labels: (tensor) selected bbox labels.
164 | ''' 165 | imw, imh = img.shape[1], img.shape[0] 166 | while True: 167 | min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])# random choice the one 168 | if min_iou is None: 169 | return img, boxes, labels 170 | 171 | for _ in range(100): 172 | w = random.randrange(int(0.1*imw), imw) 173 | h = random.randrange(int(0.1*imh), imh) 174 | 175 | if h > 2*w or w > 2*h or h < 1 or w < 1: 176 | continue 177 | 178 | x = random.randrange(imw - w) 179 | y = random.randrange(imh - h) 180 | roi = torch.Tensor([[x, y, x+w, y+h]]) 181 | 182 | center = (boxes[:,:2] + boxes[:,2:]) / 2 # [N,2] 183 | roi2 = roi.expand(len(center), 4) # [N,4] 184 | 185 | mask = (center > roi2[:,:2]) & (center < roi2[:,2:]) # [N,2] 186 | mask = mask[:,0] & mask[:,1] #[N,] 187 | 188 | if not mask.any(): 189 | continue 190 | 191 | selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1)) 192 | 193 | iou = self.data_encoder.iou(selected_boxes, roi) 194 | if iou.min() < min_iou: 195 | continue 196 | img = img[y:y+h, x:x+w, :] 197 | 198 | selected_boxes[:,0].add_(-x).clamp_(min=0, max=w) 199 | selected_boxes[:,1].add_(-y).clamp_(min=0, max=h) 200 | selected_boxes[:,2].add_(-x).clamp_(min=0, max=w) 201 | selected_boxes[:,3].add_(-y).clamp_(min=0, max=h) 202 | 203 | return img, selected_boxes, labels[mask] 204 | 205 | def __len__(self): 206 | return self.num_samples -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/encoder.py: -------------------------------------------------------------------------------- 1 | '''Encode target locations and labels.''' 2 | import torch 3 | 4 | import math 5 | import itertools 6 | 7 | class DataEncoder: 8 | def __init__(self): 9 | '''Compute default box sizes with scale and aspect transform.''' 10 | scale = 300. 11 | steps = [s / scale for s in (32, 64, 100, 300, 4, 8, 16)] 12 | sizes_ssd = [s / scale for s in (111, 162, 213, 264, 315)] 13 | sizes_fusion = [s / scale for s in (15, 30, 60, 111)] 14 | aspect_ratios = ((2,3), (2,3), (2,), (2,), (2,), (2,), (2,)) 15 | feature_map_sizes = (10, 5, 3, 1, 75, 38, 19) 16 | num_layers = len(feature_map_sizes) 17 | 18 | boxes = [] 19 | for i in range(num_layers): 20 | fmsize = feature_map_sizes[i] # feature map size 21 | for h,w in itertools.product(range(fmsize), repeat=2): 22 | cx = (w + 0.5)*steps[i] 23 | cy = (h + 0.5)*steps[i] 24 | 25 | if i<4: 26 | s = sizes_ssd[i] 27 | boxes.append((cx, cy, s, s)) 28 | 29 | s = math.sqrt(sizes_ssd[i] * sizes_ssd[i+1]) 30 | boxes.append((cx, cy, s, s)) 31 | 32 | s = sizes_ssd[i] 33 | for ar in aspect_ratios[i]: 34 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 35 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 36 | 37 | else: 38 | s = sizes_fusion[i-4] 39 | boxes.append((cx, cy, s, s)) 40 | 41 | s = math.sqrt(sizes_fusion[i-4] * sizes_fusion[i-4+1]) 42 | boxes.append((cx, cy, s, s)) 43 | 44 | s = sizes_fusion[i-4] 45 | for ar in aspect_ratios[i]: 46 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 47 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 48 | 49 | self.default_boxes = torch.Tensor(boxes) 50 | 51 | def iou(self, box1, box2): 52 | '''Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2]. 53 | 54 | Args: 55 | box1: (tensor) bounding boxes, sized [N,4]. 56 | box2: (tensor) bounding boxes, sized [M,4]. 57 | 58 | Return: 59 | (tensor) iou, sized [N,M]. 
60 | ''' 61 | N = box1.size(0) 62 | M = box2.size(0) 63 | 64 | lt = torch.max( 65 | box1[:,:2].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 66 | box2[:,:2].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 67 | ) 68 | 69 | rb = torch.min( 70 | box1[:,2:].unsqueeze(1).expand(N,M,2), # [N,2] -> [N,1,2] -> [N,M,2] 71 | box2[:,2:].unsqueeze(0).expand(N,M,2), # [M,2] -> [1,M,2] -> [N,M,2] 72 | ) 73 | 74 | wh = rb - lt # [N,M,2] 75 | wh[wh<0] = 0 # clip at 0 76 | inter = wh[:,:,0] * wh[:,:,1] # [N,M] 77 | 78 | area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1]) # [N,] 79 | area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1]) # [M,] 80 | area1 = area1.unsqueeze(1).expand_as(inter) # [N,] -> [N,1] -> [N,M] 81 | area2 = area2.unsqueeze(0).expand_as(inter) # [M,] -> [1,M] -> [N,M] 82 | 83 | iou = inter / (area1 + area2 - inter) 84 | return iou 85 | 86 | def encode(self, boxes, classes, threshold=0.5): 87 | '''Transform target bounding boxes and class labels to SSD boxes and classes. 88 | 89 | Match each object box to all the default boxes, pick the ones with the 90 | Jaccard-Index > 0.5: 91 | Jaccard(A,B) = AB / (A+B-AB) 92 | 93 | Args: 94 | boxes: (tensor) object bounding boxes (xmin,ymin,xmax,ymax) of a image, sized [#obj, 4]. 95 | classes: (tensor) object class labels of a image, sized [#obj,]. 96 | threshold: (float) Jaccard index threshold 97 | 98 | Returns: 99 | boxes: (tensor) bounding boxes, sized [#obj, 8732, 4]. 100 | classes: (tensor) class labels, sized [8732,] 101 | ''' 102 | default_boxes = self.default_boxes 103 | num_default_boxes = default_boxes.size(0) 104 | num_objs = boxes.size(0) 105 | 106 | iou = self.iou( # [#obj,8732] 107 | boxes, 108 | torch.cat([default_boxes[:,:2] - default_boxes[:,2:]/2, 109 | default_boxes[:,:2] + default_boxes[:,2:]/2], 1) 110 | ) 111 | 112 | iou, max_idx = iou.max(0) # [1,8732] 113 | max_idx.squeeze_(0) # [8732,] 114 | iou.squeeze_(0) # [8732,] 115 | 116 | boxes = boxes[max_idx] # [8732,4] 117 | variances = [0.1, 0.2] 118 | cxcy = (boxes[:,:2] + boxes[:,2:])/2 - default_boxes[:,:2] # [8732,2] 119 | cxcy /= variances[0] * default_boxes[:,2:] 120 | wh = (boxes[:,2:] - boxes[:,:2]) / default_boxes[:,2:] # [8732,2] 121 | wh = torch.log(wh) / variances[1] 122 | loc = torch.cat([cxcy, wh], 1) # [8732,4] 123 | 124 | conf = 1 + classes[max_idx] # [8732,], background class = 0 125 | conf[iou 0: 153 | try: 154 | i = order[0] 155 | except: 156 | i = order 157 | keep.append(i) 158 | 159 | if order.size == 1: 160 | break 161 | 162 | xx1 = x1[order[1:]].clamp(min=x1[i]) 163 | yy1 = y1[order[1:]].clamp(min=y1[i]) 164 | xx2 = x2[order[1:]].clamp(max=x2[i]) 165 | yy2 = y2[order[1:]].clamp(max=y2[i]) 166 | 167 | w = (xx2-xx1).clamp(min=0) 168 | h = (yy2-yy1).clamp(min=0) 169 | inter = w*h 170 | 171 | if mode == 'union': 172 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 173 | elif mode == 'min': 174 | ovr = inter / areas[order[1:]].clamp(max=areas[i]) 175 | else: 176 | raise TypeError('Unknown nms mode: %s.' % mode) 177 | 178 | ids = (ovr<=threshold).nonzero().squeeze() 179 | if ids.size == 0: 180 | break 181 | order = order[ids+1] 182 | return torch.LongTensor(keep) 183 | 184 | def decode(self, loc, conf): 185 | '''Transform predicted loc/conf back to real bbox locations and class labels. 186 | 187 | Args: 188 | loc: (tensor) predicted loc, sized [8732,4]. 189 | conf: (tensor) predicted conf, sized [8732,21]. 190 | 191 | Returns: 192 | boxes: (tensor) bbox locations, sized [#obj, 4]. 193 | labels: (tensor) class labels, sized [#obj,1]. 
194 | ''' 195 | variances = (0.1, 0.2) 196 | wh = torch.exp(loc[:,2:]*variances[1]) * self.default_boxes[:,2:] 197 | cxcy = loc[:,:2] * variances[0] * self.default_boxes[:,2:] + self.default_boxes[:,:2] 198 | box_preds = torch.cat([cxcy-wh/2, cxcy+wh/2], 1) # [8732,4] 199 | 200 | boxes = [] 201 | labels = [] 202 | scores = [] 203 | num_classes = conf.size(1) 204 | for i in range(num_classes-1): 205 | score = conf[:,i+1] # class i corresponds to (i+1) column 206 | mask = score > 0.1 207 | 208 | if not mask.any(): 209 | continue 210 | 211 | box = box_preds[mask.nonzero().squeeze()] 212 | score = score[mask] 213 | 214 | if len(score) == 1: 215 | continue 216 | keep = self.nms(box, score, threshold=0.3) 217 | boxes.append(box[keep]) 218 | labels.append(torch.LongTensor(len(box[keep])).fill_(i)) 219 | scores.append(score[keep]) 220 | 221 | return boxes, labels, scores 222 | -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/fusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | from torch.autograd import Variable 7 | 8 | from norm import L2Norm 9 | 10 | class FusionBlock(nn.Module): 11 | def __init__(self, big_features, small_features): 12 | super(FusionBlock, self).__init__() 13 | 14 | # Bigger feature map 15 | self.conv1_1 = nn.Conv2d(big_features, 256, kernel_size=3, padding=1, dilation=1) 16 | self.Norm1 = L2Norm(256, 20) 17 | 18 | # Smaller feature map 19 | self.deconv2_1 = nn.ConvTranspose2d(small_features, 256, 2, stride=2, dilation=1) 20 | self.conv2_1 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 21 | self.bn2_1 = nn.BatchNorm2d(256) 22 | self.deconv2_2 = nn.ConvTranspose2d(256, 256, 2, stride=2, dilation=1) 23 | self.conv2_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 24 | self.bn2_2 = nn.BatchNorm2d(256) 25 | self.deconv2_3 = nn.ConvTranspose2d(256, 256, 3, stride=2, dilation=1) 26 | self.conv2_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1, dilation=1) 27 | self.Norm2 = L2Norm(256, 20) 28 | 29 | # Common 30 | self.conv3_1 = nn.Conv2d(256, big_features, kernel_size=3, padding=1, dilation=1) 31 | 32 | 33 | def forward(self, big, small): 34 | h1 = self.conv1_1(big) 35 | h1 = self.Norm1(h1) 36 | 37 | h2 = self.deconv2_1(small) 38 | # print(h2.size()) 39 | h2 = F.relu(self.bn2_1(self.conv2_1(h2))) 40 | # print(h2.size()) 41 | h2 = self.deconv2_2(h2) 42 | # print(h2.size()) 43 | h2 = F.relu(self.bn2_2(self.conv2_2(h2))) 44 | # print(h2.size()) 45 | h2 = self.deconv2_3(h2) 46 | # print(h2.size()) 47 | h2 = self.conv2_3(h2) 48 | # print(h2.size()) 49 | h2 = self.Norm2(h2) 50 | 51 | size = h2.size()[3] 52 | diff_odd = h2.size()[-1] - h1.size()[-1] 53 | h2 = h2[:,:,(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2)),(int(diff_odd/2)+diff_odd%2):(size-int(diff_odd/2))] 54 | 55 | # print(h1.size(), h2.size()) 56 | h = F.relu(h1+h2) 57 | h = F.relu(self.conv3_1(h)) 58 | 59 | return h 60 | 61 | if __name__ == '__main__': 62 | big = torch.randn(1, 256, 128, 128) 63 | small = torch.rand(1,512,16,16) 64 | net = FusionBlock(256,512) 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/gen_test_file.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.nn.functional as F 4 | import 
torchvision.transforms as transforms 5 | 6 | import sys 7 | from mdssd import MDSSD300 8 | from encoder import DataEncoder 9 | import cv2 10 | 11 | import pandas as pd 12 | import shutil 13 | import os 14 | import numpy as np 15 | import glob 16 | import xml.etree.ElementTree as ET 17 | from xml.dom import minidom 18 | 19 | TEST_DIR = '/home/siddhant/deeplearning/Dataset/VisDrone2019/VisDrone2019-DET-val/images/' 20 | TEST_ANNOT = '/home/siddhant/deeplearning/Dataset/VisDrone2019/VisDrone2019-DET-val/annotations/' 21 | 22 | LABELS = ( 23 | 'ignored regions', 24 | 'pedestrian', 25 | 'people', 26 | 'bicycle', 27 | 'car', 28 | 'van', 29 | 'truck', 30 | 'tricycle', 31 | 'awning-tricycle', 32 | 'bus', 33 | 'motor', 34 | 'other' 35 | ) 36 | 37 | def GT(annotation_file): 38 | # Load model 39 | net = MDSSD300() 40 | checkpoint = torch.load('./checkpoint/ckpt.pth') 41 | 42 | keys = [] 43 | for k,v in checkpoint['net'].items(): 44 | if "module" in k: 45 | keys.append(k) 46 | for i in keys: 47 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 48 | del checkpoint['net'][i] 49 | 50 | net.load_state_dict(checkpoint['net']) 51 | net.eval() 52 | 53 | count = 0 54 | for i in os.listdir(annotation_file): 55 | count += 1 56 | print(count) 57 | with open(os.path.join(annotation_file,i)) as f: 58 | f = f.read().split("\n") 59 | f = f[:-1] 60 | num_objs = len(f) 61 | 62 | file = open(os.path.join("../test/gt/",i[:-4]+".txt"), "w") 63 | 64 | for j in range(num_objs): 65 | f[j] = f[j].split(",") 66 | label = int(f[j][5]) 67 | if label == 0: 68 | continue 69 | xmin = float(f[j][0]) 70 | ymin = float(f[j][1]) 71 | w = float(f[j][2]) 72 | h = float(f[j][3]) 73 | file.write(str(LABELS[label])+" "+str(int(xmin))+" "+str(int(ymin))+" "+str(int(xmin+w))+" "+str(int(ymin+h))+"\n") 74 | file.close() 75 | 76 | def detect(image_dir): 77 | # Load model 78 | net = MDSSD300() 79 | checkpoint = torch.load('./checkpoint/ckpt.pth') 80 | 81 | keys = [] 82 | for k,v in checkpoint['net'].items(): 83 | if "module" in k: 84 | keys.append(k) 85 | for i in keys: 86 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 87 | del checkpoint['net'][i] 88 | 89 | net.load_state_dict(checkpoint['net']) 90 | net.eval() 91 | 92 | count = 0 93 | for i in os.listdir(image_dir): 94 | count += 1 95 | print(count) 96 | file = open("../test/detect/"+i[:-4]+".txt","w") 97 | img = cv2.imread(os.path.join(image_dir,i)) 98 | 99 | img1 = cv2.resize(img, (300, 300)) 100 | transform = transforms.Compose([transforms.ToTensor(), 101 | transforms.Normalize(mean=(0.356, 0.368, 0.362), std=(0.242, 0.235, 0.236))]) 102 | img1 = transform(img1) 103 | 104 | # Forward 105 | with torch.no_grad(): 106 | x = torch.tensor(img1) 107 | loc_preds, conf = net(x.unsqueeze(0)) 108 | # Decode 109 | data_encoder = DataEncoder() 110 | boxes, labels, scores = data_encoder.decode(loc_preds.data.squeeze(0), F.softmax(conf.squeeze(0), dim=1).data) 111 | 112 | for box, label, score in zip(boxes, labels, scores): 113 | for b, l, s in zip(box, label, score): 114 | # print(b,l,s) 115 | if l.item() == 0: 116 | continue 117 | b[::2] *= img.shape[1] 118 | b[1::2] *= img.shape[0] 119 | 120 | xmin = str(int(b[0].item())) 121 | ymin = str(int(b[1].item())) 122 | xmax = str(int(b[2].item())) 123 | ymax = str(int(b[3].item())) 124 | confidence = str(s.item()) 125 | label = str(LABELS[int(l.item())]) 126 | file.write(label+" "+confidence+" "+xmin+" "+ymin+" "+xmax+" "+ymax+"\n") 127 | 128 | file.close() 129 | 130 | 131 | if __name__ == "__main__": 132 | GT(TEST_ANNOT) 133 | detect(TEST_DIR) 134 | 
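The two loops above write plain-text files that standard mAP evaluation scripts can consume: GT() emits one "<label> <xmin> <ymin> <xmax> <ymax>" line per ground-truth object, and detect() emits "<label> <confidence> <xmin> <ymin> <xmax> <ymax>" per prediction. A small parsing sketch follows (the file name is hypothetical); numeric fields are read from the right so a label could in principle contain spaces.

```python
# Sketch: read back one detection file written by detect() above.
# Each line: "<label> <confidence> <xmin> <ymin> <xmax> <ymax>".
def read_detections(path):
    dets = []
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) < 6:
                continue  # skip blank or malformed lines
            conf, xmin, ymin, xmax, ymax = map(float, parts[-5:])
            label = " ".join(parts[:-5])
            dets.append((label, conf, (xmin, ymin, xmax, ymax)))
    return dets

# Hypothetical file name under the output directory used by detect():
print(read_detections("../test/detect/0000001_00000_d_0000001.txt")[:3])
```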
-------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/mdssd.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | 9 | from torch.autograd import Variable 10 | 11 | from multibox_layer import MultiBoxLayer 12 | from fusion import FusionBlock 13 | from norm import L2Norm 14 | from attention import AttentionBlock 15 | 16 | class MDSSD300(nn.Module): 17 | input_size = 300 18 | 19 | def __init__(self): 20 | super(MDSSD300, self).__init__() 21 | 22 | # Attention 23 | self.attn1 = AttentionBlock(3) 24 | 25 | # model 26 | self.base = self.VGG16() 27 | self.norm4 = L2Norm(512, 20) # 38 28 | 29 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 30 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 31 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 32 | self.bn5 = nn.BatchNorm2d(512) 33 | 34 | self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 35 | self.bn6 = nn.BatchNorm2d(1024) 36 | 37 | self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 38 | self.bn7 = nn.BatchNorm2d(1024) 39 | # self.attn2 = AttentionBlock(1024) 40 | 41 | self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1) 42 | self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2) 43 | self.bn8 = nn.BatchNorm2d(512) 44 | 45 | self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1) 46 | self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2) 47 | self.bn9 = nn.BatchNorm2d(256) 48 | 49 | self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1) 50 | self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3) 51 | 52 | self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1) 53 | self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3) 54 | 55 | self.Fusion1 = FusionBlock(256,512) 56 | self.Fusion2 = FusionBlock(512,256) 57 | self.Fusion3 = FusionBlock(1024,256) 58 | 59 | # multibox layer 60 | self.multibox = MultiBoxLayer() 61 | 62 | def forward(self, x): 63 | odd_count = 0 64 | odd = [] 65 | hs = [] 66 | vgg = [] 67 | fusion_layers = [] 68 | 69 | h= self.attn1(x) 70 | 71 | h = self.base[0](h) 72 | vgg.append(h) 73 | for i in range(1,len(self.base)): 74 | h = self.base[i](h) 75 | vgg.append(h) 76 | fusion_layers.append(vgg[15]) 77 | odd.append(2) 78 | odd_count = 3 79 | fusion_layers.append(h) 80 | h = F.max_pool2d(h, kernel_size=2, stride=2, ceil_mode=True) 81 | 82 | h = F.relu(self.conv5_1(h)) 83 | h = F.relu(self.conv5_2(h)) 84 | h = F.relu(self.conv5_3(h)) 85 | h = F.max_pool2d(self.bn5(h), kernel_size=3, padding=1, stride=1, ceil_mode=True) 86 | 87 | h = F.relu(self.bn6(self.conv6(h))) 88 | h = F.relu(self.bn7(self.conv7(h))) 89 | fusion_layers.append(h) 90 | # h = self.attn2(h) 91 | 92 | h = F.relu(self.conv8_1(h)) 93 | h = F.relu(self.bn8(self.conv8_2(h))) 94 | hs.append(h) # conv8_2 95 | 96 | h = F.relu(self.conv9_1(h)) 97 | h = F.relu(self.bn9(self.conv9_2(h))) 98 | hs.append(h) # conv9_2 99 | 100 | h = F.relu(self.conv10_1(h)) 101 | h = F.relu(self.conv10_2(h)) 102 | hs.append(h) # conv10_2 103 | 104 | h = F.relu(self.conv11_1(h)) 105 | h = F.relu(self.conv11_2(h)) 106 | hs.append(h) # conv11_2 107 | 108 | # Fusion Blocks 109 | f = self.Fusion1(fusion_layers[0],hs[-4]) 110 | hs.append(f) 111 | f = self.Fusion2(fusion_layers[1],hs[-4]) 112 | hs.append(f) 113 | diff_odd = fusion_layers[2].size()[-1] - 
hs[-4].size()[-1] 114 | f = self.Fusion3(fusion_layers[2],hs[-4]) 115 | hs.append(f) 116 | 117 | loc_preds, conf_preds = self.multibox(hs) 118 | 119 | return loc_preds, conf_preds 120 | 121 | def VGG16(self): 122 | '''VGG16 layers.''' 123 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512] 124 | layers = [] 125 | in_channels = 3 126 | for x in cfg: 127 | if x == 'M': 128 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 129 | else: 130 | layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 131 | nn.ReLU(True)] 132 | in_channels = x 133 | return nn.Sequential(*layers) 134 | 135 | if __name__ == '__main__': 136 | t = torch.randn(1, 3, 300, 300) 137 | net = MDSSD300() 138 | # print(net) 139 | res = net.forward(t) 140 | -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/multibox_layer.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLayer(nn.Module): 12 | num_classes = 13 13 | num_anchors = [6,6,4,4,4,4,4] 14 | 15 | def __init__(self): 16 | super(MultiBoxLayer, self).__init__() 17 | self.in_planes = [512,256,256,256,256,512,1024] 18 | self.loc_layers = nn.ModuleList() 19 | self.conf_layers = nn.ModuleList() 20 | for i in range(len(self.in_planes)): 21 | self.loc_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*4, kernel_size=3, padding=1)) 22 | self.conf_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*13, kernel_size=3, padding=1)) 23 | 24 | def forward(self, xs): 25 | ''' 26 | Args: 27 | xs: (list) of tensor containing intermediate layer outputs. 28 | 29 | Returns: 30 | loc_preds: (tensor) predicted locations, sized [N,8732,4]. 31 | conf_preds: (tensor) predicted class confidences, sized [N,8732,21]. 32 | ''' 33 | y_locs = [] 34 | y_confs = [] 35 | for i, x in enumerate(xs): 36 | y_loc = self.loc_layers[i](x) 37 | N = y_loc.size(0) 38 | y_loc = y_loc.permute(0,2,3,1).contiguous() 39 | y_loc = y_loc.view(N,-1,4) 40 | y_locs.append(y_loc) 41 | 42 | y_conf = self.conf_layers[i](x) 43 | y_conf = y_conf.permute(0,2,3,1).contiguous() 44 | y_conf = y_conf.view(N,-1,13) 45 | y_confs.append(y_conf) 46 | 47 | loc_preds = torch.cat(y_locs, 1) 48 | conf_preds = torch.cat(y_confs, 1) 49 | return loc_preds, conf_preds 50 | -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/multibox_loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLoss(nn.Module): 12 | num_classes = 13 13 | 14 | def __init__(self): 15 | super(MultiBoxLoss, self).__init__() 16 | 17 | def cross_entropy_loss(self, x, y): 18 | '''Cross entropy loss w/o averaging across all samples. 19 | 20 | Args: 21 | x: (tensor) sized [N,D]. 22 | y: (tensor) sized [N,]. 23 | 24 | Return: 25 | (tensor) cross entroy loss, sized [N,]. 
26 | ''' 27 | xmax = x.data.max() 28 | print('x y size {} {}'.format(x.size(), y.size())) 29 | log_sum_exp = torch.log(torch.sum(torch.exp(x-xmax), 1)) + xmax 30 | print('log_sum_exp {}'.format(log_sum_exp.size())) 31 | return log_sum_exp - x.gather(1, y.view(-1,1)) 32 | 33 | def hard_negative_mining(self, conf_loss, pos): 34 | '''Return negative indices that is 3x the number as postive indices. 35 | 36 | Args: 37 | conf_loss: (tensor) cross entroy loss between conf_preds and conf_targets, sized [N*8732,]. 38 | pos: (tensor) positive(matched) box indices, sized [N,8732]. 39 | 40 | Return: 41 | (tensor) negative indices, sized [N,8732]. 42 | ''' 43 | batch_size, num_boxes = pos.size() 44 | conf_loss[pos.view(-1)] = 0 # set pos boxes = 0, the rest are neg conf_loss 45 | conf_loss = conf_loss.view(batch_size, -1) # [N,8732] 46 | 47 | _,idx = conf_loss.sort(1, descending=True) # sort by neg conf_loss 48 | _,rank = idx.sort(1) # [N,8732] 49 | 50 | num_pos = pos.long().sum(1) # [N,1] 51 | num_neg = torch.clamp(3*num_pos, max=num_boxes-1) # [N,1] 52 | 53 | neg = rank < num_neg.unsqueeze(1).expand_as(rank) 54 | 55 | return neg 56 | 57 | def forward(self, loc_preds, loc_targets, conf_preds, conf_targets): 58 | '''Compute loss between (loc_preds, loc_targets) and (conf_preds, conf_targets). 59 | 60 | Args: 61 | loc_preds: (tensor) predicted locations, sized [batch_size, 8732, 4]. 62 | loc_targets: (tensor) encoded target locations, sized [batch_size, 8732, 4]. 63 | conf_preds: (tensor) predicted class confidences, sized [batch_size, 8732, num_classes]. 64 | conf_targets: (tensor) encoded target classes, sized [batch_size, 8732]. 65 | 66 | loss: 67 | (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + CrossEntropyLoss(conf_preds, conf_targets). 68 | ''' 69 | 70 | # loc_preds = loc_preds[:,:8732,:] 71 | # conf_preds = conf_preds[:,:8732,:] 72 | 73 | batch_size, num_boxes, _ = loc_preds.size() 74 | pos = conf_targets > 0 # [N,8732], pos means the box matched. 
75 | # print(pos.size()) 76 | num_matched_boxes = pos.data.float().sum() 77 | if num_matched_boxes == 0: 78 | return torch.tensor([0.], requires_grad=True) 79 | 80 | ################################################################ 81 | # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets) 82 | ################################################################ 83 | pos_mask = pos.unsqueeze(2).expand_as(loc_preds) # [N,8732,4] 84 | pos_loc_preds = loc_preds[pos_mask].view(-1,4) # [#pos,4] 85 | pos_loc_targets = loc_targets[pos_mask].view(-1,4) # [#pos,4] 86 | 87 | loc_loss = F.smooth_l1_loss(pos_loc_preds, pos_loc_targets, size_average=False) 88 | 89 | ################################################################ 90 | # conf_loss = CrossEntropyLoss(pos_conf_preds, pos_conf_targets) 91 | # + CrossEntropyLoss(neg_conf_preds, neg_conf_targets) 92 | ################################################################ 93 | conf_preds = conf_preds.contiguous() 94 | # print(conf_preds.size(), conf_targets.size()) 95 | conf_loss = F.cross_entropy(conf_preds.view(-1,self.num_classes), \ 96 | conf_targets.view(-1), reduce=False) # [N*8732,] 97 | neg = self.hard_negative_mining(conf_loss, pos) # [N,8732] 98 | 99 | pos_mask = pos.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 100 | neg_mask = neg.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 101 | mask = (pos_mask+neg_mask).gt(0) 102 | 103 | pos_and_neg = (pos+neg).gt(0) 104 | preds = conf_preds[mask].view(-1,self.num_classes) # [#pos+#neg,21] 105 | targets = conf_targets[pos_and_neg] # [#pos+#neg,] 106 | conf_loss = F.cross_entropy(preds, targets, size_average=False) 107 | 108 | loc_loss /= num_matched_boxes 109 | conf_loss /= num_matched_boxes 110 | 111 | return loc_loss + conf_loss 112 | -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | 6 | class L2Norm(nn.Module): 7 | '''L2Norm layer across all channels and scale.''' 8 | def __init__(self, in_features,scale): 9 | super(L2Norm, self).__init__() 10 | self.weight = nn.Parameter(torch.Tensor(in_features)) 11 | self.reset_parameters(scale) 12 | 13 | def reset_parameters(self, scale): 14 | nn.init.constant_(self.weight, scale) 15 | 16 | def forward(self, x): 17 | x = F.normalize(x, dim=1) 18 | scale = self.weight[None,:,None,None] 19 | return scale * x -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.nn.functional as F 4 | import torchvision.transforms as transforms 5 | 6 | import sys 7 | from mdssd import MDSSD300 8 | from encoder import DataEncoder 9 | import cv2 10 | 11 | VOC_LABELS = ( 12 | 'ignored regions', 13 | 'pedestrian', 14 | 'people', 15 | 'bicycle', 16 | 'car', 17 | 'van', 18 | 'truck', 19 | 'tricycle', 20 | 'awning-tricycle', 21 | 'bus', 22 | 'motor', 23 | 'other' 24 | ) 25 | 26 | 27 | # Load model 28 | net = MDSSD300() 29 | checkpoint = torch.load('./checkpoint/ckpt.pth') 30 | 31 | keys = [] 32 | for k,v in checkpoint['net'].items(): 33 | if "module" in k: 34 | keys.append(k) 35 | for i in keys: 36 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 37 | del checkpoint['net'][i] 38 | 39 | 
net.load_state_dict(checkpoint['net']) 40 | net.eval() 41 | 42 | if len(sys.argv) == 2: 43 | img_path = sys.argv[1] 44 | else: 45 | img_path = './images/img7.jpg' 46 | 47 | # Load test image 48 | img = cv2.imread(img_path) 49 | img1 = cv2.resize(img, (300, 300)) 50 | transform = transforms.Compose([transforms.ToTensor(), 51 | transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]) 52 | img1 = transform(img1) 53 | 54 | # Forward 55 | with torch.no_grad(): 56 | x = torch.tensor(img1) 57 | loc_preds, conf = net(x.unsqueeze(0)) 58 | # Decode 59 | data_encoder = DataEncoder() 60 | boxes, labels, scores = data_encoder.decode(loc_preds.data.squeeze(0), F.softmax(conf.squeeze(0), dim=1).data) 61 | for box, label, score in zip(boxes, labels, scores): 62 | for b, s in zip(box, score): 63 | if s > 0.5: 64 | b[::2] *= img.shape[1] 65 | b[1::2] *= img.shape[0] 66 | print('label:',VOC_LABELS[int(label[0])], 'score:', score) 67 | b = list(b) 68 | cv2.rectangle(img, (b[0], b[1]), (b[2], b[3]), (255, 255, 255), 2) 69 | title = '{}: {}'.format(VOC_LABELS[int(label[0])], round(float(score[0]), 2)) 70 | cv2.putText(img, title, (b[0], b[1]), cv2.FONT_ITALIC, 0.6, (0, 255, 0), 2) 71 | cv2.imshow('img', img) 72 | cv2.waitKey(0) -------------------------------------------------------------------------------- /MDSSD_with_self_attention/scripts/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import argparse 5 | import itertools 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | import torch.backends.cudnn as cudnn 12 | 13 | import torchvision 14 | import torchvision.transforms as transforms 15 | 16 | import numpy as np 17 | from mdssd import MDSSD300 18 | from datagen import ListDataset 19 | from multibox_loss import MultiBoxLoss 20 | 21 | TRAIN_IMG_DIR = '../../../../VisDrone2019/dataset/VisDrone2018-DET-train/images/' 22 | TRAIN_ANNOT_DIR = '../../../../VisDrone2019/dataset/VisDrone2018-DET-train/annotations/' 23 | VAL_IMAGE_DIR = '../../../../VisDrone2019/dataset/VisDrone2019-DET-val/images/' 24 | VAL_ANNOT_DIR = '../../../../VisDrone2019/dataset/VisDrone2019-DET-val/annotations/' 25 | 26 | lr = 0.001 27 | resume = True # Resume from checkpoint 28 | epoch = 200 29 | batch_size = 8 30 | 31 | use_cuda = torch.cuda.is_available() 32 | best_loss = float('inf') # best test loss 33 | start_epoch = 0 # start from epoch 0 or last epoch 34 | 35 | # Data 36 | print('==> Preparing data..') 37 | transform = transforms.Compose([transforms.ToTensor(), 38 | transforms.Normalize(mean=(0.356, 0.368, 0.362), std=(0.242, 0.235, 0.236))]) 39 | 40 | trainset = ListDataset(root=TRAIN_IMG_DIR, list_file=TRAIN_ANNOT_DIR, train=True, transform=transform) 41 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True, num_workers=4) 42 | 43 | valset = ListDataset(root=VAL_IMAGE_DIR, list_file=VAL_ANNOT_DIR, train=True, transform=transform) 44 | valloader = torch.utils.data.DataLoader(valset, batch_size=8, shuffle=True, num_workers=4) 45 | 46 | # Model 47 | net = MDSSD300() 48 | if resume: 49 | print('==> Resuming from checkpoint..') 50 | checkpoint = torch.load('./checkpoint/ckpt.pth') 51 | 52 | keys = [] 53 | for k,v in checkpoint['net'].items(): 54 | if "module" in k: 55 | keys.append(k) 56 | for i in keys: 57 | checkpoint['net'][i[7:]] = checkpoint['net'][i] 58 | del checkpoint['net'][i] 59 | 60 | 
net.load_state_dict(checkpoint['net']) 61 | best_loss = checkpoint['loss'] 62 | start_epoch = checkpoint['epoch'] 63 | else: 64 | # Convert from pretrained VGG model. 65 | try: 66 | net.load_state_dict(torch.load('../model/ssd.pth')) 67 | print('==> Pretrain model read successfully') 68 | except: 69 | print('==> Pretrain model read failed or not existed, training from init') 70 | 71 | criterion = MultiBoxLoss() 72 | 73 | if use_cuda: 74 | net = torch.nn.DataParallel(net, device_ids=[0]) 75 | net.cuda() 76 | cudnn.benchmark = True 77 | 78 | optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4) 79 | # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.25) 80 | 81 | # Training 82 | def train(epoch,prev_val_loss, last_saved): 83 | print('\nEpoch: %d' % epoch) 84 | net.train() 85 | train_loss = 0 86 | for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader): 87 | if use_cuda: 88 | images = images.cuda() 89 | loc_targets = loc_targets.cuda() 90 | conf_targets = conf_targets.cuda() 91 | 92 | images = torch.tensor(images) 93 | loc_targets = torch.tensor(loc_targets) 94 | conf_targets = torch.tensor(conf_targets) 95 | 96 | optimizer.zero_grad() 97 | loc_preds, conf_preds = net(images) 98 | loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 99 | loss.backward() 100 | optimizer.step() 101 | # scheduler.step() 102 | 103 | train_loss += loss.item() 104 | 105 | if batch_idx%100 == 0: 106 | val_loss_tot = 0 107 | for batch_idx_val, (images, loc_targets, conf_targets) in enumerate(valloader): 108 | if use_cuda: 109 | images = images.cuda() 110 | loc_targets = loc_targets.cuda() 111 | conf_targets = conf_targets.cuda() 112 | 113 | images = torch.tensor(images) 114 | loc_targets = torch.tensor(loc_targets) 115 | conf_targets = torch.tensor(conf_targets) 116 | 117 | loc_preds, conf_preds = net(images) 118 | val_loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 119 | val_loss_tot += val_loss.item() 120 | 121 | val_loss_tot /= (batch_idx_val+1) 122 | if val_loss_tot < prev_val_loss: 123 | os.makedirs('checkpoint', exist_ok=True) 124 | torch.save({ 125 | 'epoch': epoch, 126 | 'net': net.state_dict(), 127 | 'loss': loss, 128 | }, 'checkpoint/ckpt.pth') 129 | print("Saved.") 130 | prev_val_loss = val_loss_tot 131 | last_saved = [epoch, batch_idx] 132 | 133 | print('epoch: {}, batch_idx: {},loss: {}, train_loss: {}, best_val_loss: {}, last_saved: {}'.format(epoch, batch_idx, loss.item(), train_loss/(batch_idx+1), prev_val_loss, last_saved)) 134 | 135 | return prev_val_loss, last_saved 136 | 137 | 138 | prev_val_loss = 999 139 | last_saved = [start_epoch,0] 140 | for epoch_num in range(start_epoch, start_epoch+epoch): 141 | prev_val_loss, last_saved = train(epoch_num, prev_val_loss, last_saved) 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch_Object_Detection 2 | This repository contains Pytorch implementations of single-shot approaches for object detection in images. 
3 | 
4 | ## Papers
5 | - [SSD: Single Shot MultiBox Detector](https://www.cs.unc.edu/~wliu/papers/ssd.pdf)
6 | - [MDSSD: Multi-scale Deconvolutional Single Shot Detector for Small Objects](https://arxiv.org/pdf/1805.07009.pdf)
7 | - [Augmentation for small object detection](https://arxiv.org/pdf/1902.07296.pdf)
8 | - [CFENet: An Accurate and Efficient Single-Shot Object Detector for Autonomous Driving](https://arxiv.org/pdf/1806.09790.pdf)
9 | - [Comprehensive Feature Enhancement Module for Single-Shot Object Detector](https://qijiezhao.github.io/imgs/cfenetv1.pdf)
10 | 
--------------------------------------------------------------------------------
/SSD/scripts/convert_vgg.py:
--------------------------------------------------------------------------------
1 | '''Convert pretrained VGG model to SSD.
2 | 
3 | VGG model downloaded from the PyTorch model zoo: https://download.pytorch.org/models/vgg16-397923af.pth
4 | '''
5 | import sys
6 | sys.path.append("../src")
7 | import torch
8 | 
9 | from ssd import SSD300
10 | 
11 | 
12 | vgg = torch.load('../model/vgg16-397923af.pth')
13 | 
14 | ssd = SSD300()
15 | layer_indices = [0,2,5,7,10,12,14,17,19,21]
16 | 
17 | for layer_idx in layer_indices:
18 |     ssd.base[layer_idx].weight.data = vgg['features.%d.weight' % layer_idx]
19 |     ssd.base[layer_idx].bias.data = vgg['features.%d.bias' % layer_idx]
20 | 
21 | # [24,26,28]
22 | ssd.conv5_1.weight.data = vgg['features.24.weight']
23 | ssd.conv5_1.bias.data = vgg['features.24.bias']
24 | ssd.conv5_2.weight.data = vgg['features.26.weight']
25 | ssd.conv5_2.bias.data = vgg['features.26.bias']
26 | ssd.conv5_3.weight.data = vgg['features.28.weight']
27 | ssd.conv5_3.bias.data = vgg['features.28.bias']
28 | for k in ssd.state_dict():
29 |     print(k)
30 | torch.save(ssd.state_dict(), '../model/ssd.pth')
31 | 
--------------------------------------------------------------------------------
/SSD/scripts/datagen.py:
--------------------------------------------------------------------------------
1 | '''
2 | Load image/class/box from an annotation file.
3 | 
4 | '''
5 | from __future__ import print_function
6 | 
7 | import os
8 | import sys
9 | import os.path
10 | 
11 | import random
12 | import numpy as np
13 | 
14 | import torch
15 | import torch.utils.data as data
16 | import torchvision.transforms as transforms
17 | 
18 | from encoder import DataEncoder
19 | import cv2
20 | 
21 | class ListDataset(data.Dataset):
22 |     img_size = 300
23 | 
24 |     def __init__(self, root, list_file, train, transform):
25 |         '''
26 |         Args:
27 |           root: (str) directory to images.
28 |           list_file: (str) path to annotation files.
29 |           train: (boolean) train or test.
30 |           transform: ([transforms]) image transforms.
31 |         '''
32 |         self.root = root
33 |         self.train = train
34 |         self.transform = transform
35 | 
36 |         self.fnames = []
37 |         self.boxes = []
38 |         self.labels = []
39 | 
40 |         self.data_encoder = DataEncoder()
41 |         self.num_samples = 0
42 | 
43 |         for i in os.listdir(list_file):
44 |             self.num_samples += 1
45 |             self.fnames.append(i)
46 |             box = []
47 |             labels = []
48 |             with open(os.path.join(list_file,i)) as f:
49 |                 f = f.read().split("\n")
50 |                 f = f[:-1]
51 |                 num_objs = len(f)
52 | 
53 |                 for j in range(num_objs):
54 |                     f[j] = f[j].split(",")
55 |                     xmin = float(f[j][0])
56 |                     ymin = float(f[j][1])
57 |                     w = float(f[j][2])
58 |                     h = float(f[j][3])
59 | 
60 |                     box.append([xmin,ymin,xmin+w,ymin+h])  # xmax = xmin + width, ymax = ymin + height
61 |                     labels.append(int(f[j][5]))
62 | 
63 |             self.boxes.append(torch.Tensor(box))
64 |             self.labels.append(torch.LongTensor(labels))
65 | 
66 |     def __getitem__(self, idx):
67 |         '''Load an image, and encode its bbox locations and class labels.
68 |         Args:
69 |           idx: (int) image index.
70 |         Returns:
71 |           img: (tensor) image tensor.
72 |           loc_target: (tensor) location targets, sized [8732,4].
73 |           conf_target: (tensor) label targets, sized [8732,].
74 |         '''
75 |         # Load image and bbox locations.
76 |         fname = self.fnames[idx]
77 |         img = cv2.imread(os.path.join(self.root, fname[:-4]+".jpg"))
78 |         boxes = self.boxes[idx].clone()
79 |         labels = self.labels[idx]
80 | 
81 |         # Data augmentation while training.
82 |         if self.train:
83 |             img, boxes = self.random_flip(img, boxes)
84 |             img, boxes, labels = self.random_crop(img, boxes, labels)
85 | 
86 |         # Scale bbox locations to [0,1].
87 |         w,h = img.shape[1], img.shape[0]
88 |         boxes /= torch.Tensor([w,h,w,h]).expand_as(boxes)
89 |         img = cv2.resize(img, (self.img_size,self.img_size))
90 |         img = self.transform(img)
91 | 
92 |         # Encode loc & conf targets.
93 | 
94 |         loc_target, conf_target = self.data_encoder.encode(boxes, labels)
95 |         return img, loc_target, conf_target
96 | 
97 |     def random_flip(self, img, boxes):
98 |         '''Randomly flip the image and adjust the bbox locations.
99 |         For bbox (xmin, ymin, xmax, ymax), the flipped bbox is:
100 |         (w-xmax, ymin, w-xmin, ymax).
101 |         Args:
102 |           img: (ndarray.Image) image.
103 |           boxes: (tensor) bbox locations, sized [#obj, 4].
104 |         Returns:
105 |           img: (ndarray.Image) randomly flipped image.
106 |           boxes: (tensor) randomly flipped bbox locations, sized [#obj, 4].
107 |         '''
108 |         if random.random() < 0.5:
109 |             img = cv2.flip(img, 1)
110 |             w = img.shape[1]
111 |             xmin = w - boxes[:,2]
112 |             xmax = w - boxes[:,0]
113 |             boxes[:,0] = xmin
114 |             boxes[:,2] = xmax
115 |         return img, boxes
116 | 
117 |     def random_crop(self, img, boxes, labels):
118 |         '''Randomly crop the image and adjust the bbox locations.
119 |         For more details, see 'Chapter 2.2: Data augmentation' of the paper.
120 |         Args:
121 |           img: (ndarray.Image) image.
122 |           boxes: (tensor) bbox locations, sized [#obj, 4].
123 |           labels: (tensor) bbox labels, sized [#obj,].
124 |         Returns:
125 |           img: (ndarray.Image) cropped image.
126 |           selected_boxes: (tensor) selected bbox locations.
127 |           labels: (tensor) selected bbox labels.
128 | ''' 129 | imw, imh = img.shape[1], img.shape[0] 130 | while True: 131 | min_iou = random.choice([None, 0.1, 0.3, 0.5, 0.7, 0.9])# random choice the one 132 | if min_iou is None: 133 | return img, boxes, labels 134 | 135 | for _ in range(100): 136 | w = random.randrange(int(0.1*imw), imw) 137 | h = random.randrange(int(0.1*imh), imh) 138 | 139 | if h > 2*w or w > 2*h or h < 1 or w < 1: 140 | continue 141 | 142 | x = random.randrange(imw - w) 143 | y = random.randrange(imh - h) 144 | roi = torch.Tensor([[x, y, x+w, y+h]]) 145 | 146 | center = (boxes[:,:2] + boxes[:,2:]) / 2 # [N,2] 147 | roi2 = roi.expand(len(center), 4) # [N,4] 148 | 149 | mask = (center > roi2[:,:2]) & (center < roi2[:,2:]) # [N,2] 150 | mask = mask[:,0] & mask[:,1] #[N,] 151 | 152 | if not mask.any(): 153 | continue 154 | 155 | selected_boxes = boxes.index_select(0, mask.nonzero().squeeze(1)) 156 | 157 | iou = self.data_encoder.iou(selected_boxes, roi) 158 | if iou.min() < min_iou: 159 | continue 160 | img = img[y:y+h, x:x+w, :] 161 | 162 | selected_boxes[:,0].add_(-x).clamp_(min=0, max=w) 163 | selected_boxes[:,1].add_(-y).clamp_(min=0, max=h) 164 | selected_boxes[:,2].add_(-x).clamp_(min=0, max=w) 165 | selected_boxes[:,3].add_(-y).clamp_(min=0, max=h) 166 | 167 | return img, selected_boxes, labels[mask] 168 | 169 | def __len__(self): 170 | return self.num_samples -------------------------------------------------------------------------------- /SSD/scripts/encoder.py: -------------------------------------------------------------------------------- 1 | '''Encode target locations and labels.''' 2 | import torch 3 | 4 | import math 5 | import itertools 6 | 7 | class DataEncoder: 8 | def __init__(self): 9 | '''Compute default box sizes with scale and aspect transform.''' 10 | scale = 300. 11 | steps = [s / scale for s in (8, 16, 32, 64, 100, 300)] 12 | sizes = [s / scale for s in (30, 60, 111, 162, 213, 264, 315)] 13 | aspect_ratios = ((2,), (2,3), (2,3), (2,3), (2,), (2,)) 14 | feature_map_sizes = (38, 19, 10, 5, 3, 1) 15 | num_layers = len(feature_map_sizes) 16 | 17 | boxes = [] 18 | for i in range(num_layers): 19 | fmsize = feature_map_sizes[i] # feature map size 20 | for h,w in itertools.product(range(fmsize), repeat=2): 21 | cx = (w + 0.5)*steps[i] 22 | cy = (h + 0.5)*steps[i] 23 | 24 | s = sizes[i] 25 | boxes.append((cx, cy, s, s)) 26 | 27 | s = math.sqrt(sizes[i] * sizes[i+1]) 28 | boxes.append((cx, cy, s, s)) 29 | 30 | s = sizes[i] 31 | for ar in aspect_ratios[i]: 32 | boxes.append((cx, cy, s * math.sqrt(ar), s / math.sqrt(ar))) 33 | boxes.append((cx, cy, s / math.sqrt(ar), s * math.sqrt(ar))) 34 | 35 | self.default_boxes = torch.Tensor(boxes) 36 | 37 | def iou(self, box1, box2): 38 | '''Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2]. 39 | 40 | Args: 41 | box1: (tensor) bounding boxes, sized [N,4]. 42 | box2: (tensor) bounding boxes, sized [M,4]. 43 | 44 | Return: 45 | (tensor) iou, sized [N,M]. 
46 |         '''
47 |         N = box1.size(0)
48 |         M = box2.size(0)
49 | 
50 |         lt = torch.max(
51 |             box1[:,:2].unsqueeze(1).expand(N,M,2),  # [N,2] -> [N,1,2] -> [N,M,2]
52 |             box2[:,:2].unsqueeze(0).expand(N,M,2),  # [M,2] -> [1,M,2] -> [N,M,2]
53 |         )
54 | 
55 |         rb = torch.min(
56 |             box1[:,2:].unsqueeze(1).expand(N,M,2),  # [N,2] -> [N,1,2] -> [N,M,2]
57 |             box2[:,2:].unsqueeze(0).expand(N,M,2),  # [M,2] -> [1,M,2] -> [N,M,2]
58 |         )
59 | 
60 |         wh = rb - lt  # [N,M,2]
61 |         wh[wh<0] = 0  # clip at 0
62 |         inter = wh[:,:,0] * wh[:,:,1]  # [N,M]
63 | 
64 |         area1 = (box1[:,2]-box1[:,0]) * (box1[:,3]-box1[:,1])  # [N,]
65 |         area2 = (box2[:,2]-box2[:,0]) * (box2[:,3]-box2[:,1])  # [M,]
66 |         area1 = area1.unsqueeze(1).expand_as(inter)  # [N,] -> [N,1] -> [N,M]
67 |         area2 = area2.unsqueeze(0).expand_as(inter)  # [M,] -> [1,M] -> [N,M]
68 | 
69 |         iou = inter / (area1 + area2 - inter)
70 |         return iou
71 | 
72 |     def encode(self, boxes, classes, threshold=0.5):
73 |         '''Transform target bounding boxes and class labels to SSD boxes and classes.
74 | 
75 |         Match each object box to all the default boxes, pick the ones with the
76 |         Jaccard-Index > 0.5:
77 |             Jaccard(A,B) = AB / (A+B-AB)
78 | 
79 |         Args:
80 |           boxes: (tensor) object bounding boxes (xmin,ymin,xmax,ymax) of an image, sized [#obj, 4].
81 |           classes: (tensor) object class labels of an image, sized [#obj,].
82 |           threshold: (float) Jaccard index threshold
83 | 
84 |         Returns:
85 |           loc: (tensor) encoded bounding boxes, sized [8732, 4].
86 |           conf: (tensor) class labels, sized [8732,]
87 |         '''
88 |         default_boxes = self.default_boxes
89 |         num_default_boxes = default_boxes.size(0)
90 |         num_objs = boxes.size(0)
91 | 
92 |         iou = self.iou(  # [#obj,8732]
93 |             boxes,
94 |             torch.cat([default_boxes[:,:2] - default_boxes[:,2:]/2,
95 |                        default_boxes[:,:2] + default_boxes[:,2:]/2], 1)
96 |         )
97 | 
98 |         iou, max_idx = iou.max(0)  # [1,8732]
99 |         max_idx.squeeze_(0)        # [8732,]
100 |         iou.squeeze_(0)            # [8732,]
101 | 
102 |         boxes = boxes[max_idx]     # [8732,4]
103 |         variances = [0.1, 0.2]
104 |         cxcy = (boxes[:,:2] + boxes[:,2:])/2 - default_boxes[:,:2]  # [8732,2]
105 |         cxcy /= variances[0] * default_boxes[:,2:]
106 |         wh = (boxes[:,2:] - boxes[:,:2]) / default_boxes[:,2:]      # [8732,2]
107 |         wh = torch.log(wh) / variances[1]
108 |         loc = torch.cat([cxcy, wh], 1)  # [8732,4]
109 | 
110 |         conf = 1 + classes[max_idx]   # [8732,], background class = 0
111 |         conf[iou<threshold] = 0       # background
112 | 
113 |         return loc, conf
114 | 
115 |     def nms(self, bboxes, scores, threshold=0.5, mode='union'):
116 |         '''Non maximum suppression.
117 | 
118 |         Args:
119 |           bboxes: (tensor) bounding boxes, sized [N,4].
120 |           scores: (tensor) bbox scores, sized [N,].
121 |           threshold: (float) overlap threshold.
122 |           mode: (str) 'union' or 'min'.
123 | 
124 |         Returns:
125 |           keep: (tensor) selected indices.
126 | 
127 |         '''
128 |         x1 = bboxes[:,0]
129 |         y1 = bboxes[:,1]
130 |         x2 = bboxes[:,2]
131 |         y2 = bboxes[:,3]
132 | 
133 |         areas = (x2-x1) * (y2-y1)
134 |         _, order = scores.sort(0, descending=True)
135 | 
136 |         keep = []
137 |         while order.numel() > 0:
138 |             i = order[0]
139 |             keep.append(i)
140 | 
141 |             if order.numel() == 1:
142 |                 break
143 | 
144 |             xx1 = x1[order[1:]].clamp(min=x1[i])
145 |             yy1 = y1[order[1:]].clamp(min=y1[i])
146 |             xx2 = x2[order[1:]].clamp(max=x2[i])
147 |             yy2 = y2[order[1:]].clamp(max=y2[i])
148 | 
149 |             w = (xx2-xx1).clamp(min=0)
150 |             h = (yy2-yy1).clamp(min=0)
151 |             inter = w*h
152 | 
153 |             if mode == 'union':
154 |                 ovr = inter / (areas[i] + areas[order[1:]] - inter)
155 |             elif mode == 'min':
156 |                 ovr = inter / areas[order[1:]].clamp(max=areas[i])
157 |             else:
158 |                 raise TypeError('Unknown nms mode: %s.' % mode)
159 | 
160 |             ids = (ovr<=threshold).nonzero().squeeze()
161 |             if ids.numel() == 0:
162 |                 break
163 |             order = order[ids+1]
164 |         return torch.LongTensor(keep)
165 | 
166 |     def decode(self, loc, conf):
167 |         '''Transform predicted loc/conf back to real bbox locations and class labels.
168 | 
169 |         Args:
170 |           loc: (tensor) predicted loc, sized [8732,4].
171 |           conf: (tensor) predicted conf, sized [8732,21].
172 | 
173 |         Returns:
174 |           boxes: (tensor) bbox locations, sized [#obj, 4].
175 |           labels: (tensor) class labels, sized [#obj,1].
176 | ''' 177 | variances = (0.1, 0.2) 178 | wh = torch.exp(loc[:,2:]*variances[1]) * self.default_boxes[:,2:] 179 | cxcy = loc[:,:2] * variances[0] * self.default_boxes[:,2:] + self.default_boxes[:,:2] 180 | box_preds = torch.cat([cxcy-wh/2, cxcy+wh/2], 1) # [8732,4] 181 | 182 | boxes = [] 183 | labels = [] 184 | scores = [] 185 | num_classes = conf.size(1) 186 | for i in range(num_classes-1): 187 | score = conf[:,i+1] # class i corresponds to (i+1) column 188 | mask = score > 0.1 189 | 190 | if not mask.any(): 191 | continue 192 | 193 | box = box_preds[mask.nonzero().squeeze()] 194 | score = score[mask] 195 | 196 | if len(score) == 1: 197 | continue 198 | keep = self.nms(box, score, threshold=0.3) 199 | boxes.append(box[keep]) 200 | labels.append(torch.LongTensor(len(box[keep])).fill_(i)) 201 | scores.append(score[keep]) 202 | 203 | return boxes, labels, scores 204 | -------------------------------------------------------------------------------- /SSD/scripts/multibox_layer.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLayer(nn.Module): 12 | num_classes = 21 13 | num_anchors = [4,6,6,6,4,4] 14 | in_planes = [512,1024,512,256,256,256] 15 | 16 | def __init__(self): 17 | super(MultiBoxLayer, self).__init__() 18 | 19 | self.loc_layers = nn.ModuleList() 20 | self.conf_layers = nn.ModuleList() 21 | for i in range(len(self.in_planes)): 22 | self.loc_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*4, kernel_size=3, padding=1)) 23 | self.conf_layers.append(nn.Conv2d(self.in_planes[i], self.num_anchors[i]*21, kernel_size=3, padding=1)) 24 | 25 | def forward(self, xs): 26 | ''' 27 | Args: 28 | xs: (list) of tensor containing intermediate layer outputs. 29 | 30 | Returns: 31 | loc_preds: (tensor) predicted locations, sized [N,8732,4]. 32 | conf_preds: (tensor) predicted class confidences, sized [N,8732,21]. 33 | ''' 34 | y_locs = [] 35 | y_confs = [] 36 | for i, x in enumerate(xs): 37 | y_loc = self.loc_layers[i](x) 38 | N = y_loc.size(0) 39 | y_loc = y_loc.permute(0,2,3,1).contiguous() 40 | y_loc = y_loc.view(N,-1,4) 41 | y_locs.append(y_loc) 42 | 43 | y_conf = self.conf_layers[i](x) 44 | y_conf = y_conf.permute(0,2,3,1).contiguous() 45 | y_conf = y_conf.view(N,-1,21) 46 | y_confs.append(y_conf) 47 | 48 | loc_preds = torch.cat(y_locs, 1) 49 | conf_preds = torch.cat(y_confs, 1) 50 | return loc_preds, conf_preds 51 | -------------------------------------------------------------------------------- /SSD/scripts/multibox_loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.init as init 8 | import torch.nn.functional as F 9 | 10 | 11 | class MultiBoxLoss(nn.Module): 12 | num_classes = 21 13 | 14 | def __init__(self): 15 | super(MultiBoxLoss, self).__init__() 16 | 17 | def cross_entropy_loss(self, x, y): 18 | '''Cross entropy loss w/o averaging across all samples. 19 | 20 | Args: 21 | x: (tensor) sized [N,D]. 22 | y: (tensor) sized [N,]. 23 | 24 | Return: 25 | (tensor) cross entroy loss, sized [N,]. 
26 |         '''
27 |         xmax = x.data.max()
28 |         # subtract the max for numerical stability before exponentiating
29 |         log_sum_exp = torch.log(torch.sum(torch.exp(x-xmax), 1)) + xmax
30 |         # per-sample cross entropy: logsumexp(x) - x[y]
31 |         return log_sum_exp - x.gather(1, y.view(-1,1))
32 | 
33 |     def hard_negative_mining(self, conf_loss, pos):
34 |         '''Return negative indices that are 3x the number of positive indices.
35 | 
36 |         Args:
37 |           conf_loss: (tensor) cross entropy loss between conf_preds and conf_targets, sized [N*8732,].
38 |           pos: (tensor) positive(matched) box indices, sized [N,8732].
39 | 
40 |         Return:
41 |           (tensor) negative indices, sized [N,8732].
42 |         '''
43 |         batch_size, num_boxes = pos.size()
44 |         conf_loss[pos.view(-1)] = 0  # set pos boxes = 0, the rest are neg conf_loss
45 |         conf_loss = conf_loss.view(batch_size, -1)  # [N,8732]
46 | 
47 |         _,idx = conf_loss.sort(1, descending=True)  # sort by neg conf_loss
48 |         _,rank = idx.sort(1)  # [N,8732]
49 | 
50 |         num_pos = pos.long().sum(1)  # [N,]
51 |         num_neg = torch.clamp(3*num_pos, max=num_boxes-1)  # [N,]
52 | 
53 |         neg = rank < num_neg.unsqueeze(1).expand_as(rank)
54 | 
55 |         return neg
56 | 
57 |     def forward(self, loc_preds, loc_targets, conf_preds, conf_targets):
58 |         '''Compute loss between (loc_preds, loc_targets) and (conf_preds, conf_targets).
59 | 
60 |         Args:
61 |           loc_preds: (tensor) predicted locations, sized [batch_size, 8732, 4].
62 |           loc_targets: (tensor) encoded target locations, sized [batch_size, 8732, 4].
63 |           conf_preds: (tensor) predicted class confidences, sized [batch_size, 8732, num_classes].
64 |           conf_targets: (tensor) encoded target classes, sized [batch_size, 8732].
65 | 
66 |         Returns:
67 |           (tensor) loss = SmoothL1Loss(loc_preds, loc_targets) + CrossEntropyLoss(conf_preds, conf_targets).
68 |         '''
69 |         batch_size, num_boxes, _ = loc_preds.size()
70 |         pos = conf_targets > 0  # [N,8732], pos means the box matched.
71 | num_matched_boxes = pos.data.float().sum() 72 | if num_matched_boxes == 0: 73 | return torch.tensor([0], requires_grad=True) 74 | 75 | ################################################################ 76 | # loc_loss = SmoothL1Loss(pos_loc_preds, pos_loc_targets) 77 | ################################################################ 78 | pos_mask = pos.unsqueeze(2).expand_as(loc_preds) # [N,8732,4] 79 | pos_loc_preds = loc_preds[pos_mask].view(-1,4) # [#pos,4] 80 | pos_loc_targets = loc_targets[pos_mask].view(-1,4) # [#pos,4] 81 | 82 | loc_loss = F.smooth_l1_loss(pos_loc_preds, pos_loc_targets, size_average=False) 83 | 84 | ################################################################ 85 | # conf_loss = CrossEntropyLoss(pos_conf_preds, pos_conf_targets) 86 | # + CrossEntropyLoss(neg_conf_preds, neg_conf_targets) 87 | ################################################################ 88 | conf_loss = F.cross_entropy(conf_preds.view(-1,self.num_classes), \ 89 | conf_targets.view(-1), reduce=False) # [N*8732,] 90 | neg = self.hard_negative_mining(conf_loss, pos) # [N,8732] 91 | 92 | pos_mask = pos.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 93 | neg_mask = neg.unsqueeze(2).expand_as(conf_preds) # [N,8732,21] 94 | mask = (pos_mask+neg_mask).gt(0) 95 | 96 | pos_and_neg = (pos+neg).gt(0) 97 | preds = conf_preds[mask].view(-1,self.num_classes) # [#pos+#neg,21] 98 | targets = conf_targets[pos_and_neg] # [#pos+#neg,] 99 | conf_loss = F.cross_entropy(preds, targets, size_average=False) 100 | 101 | loc_loss /= num_matched_boxes 102 | conf_loss /= num_matched_boxes 103 | 104 | return loc_loss + conf_loss 105 | -------------------------------------------------------------------------------- /SSD/scripts/ssd.py: -------------------------------------------------------------------------------- 1 | import math 2 | import itertools 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | 9 | from torch.autograd import Variable 10 | 11 | from multibox_layer import MultiBoxLayer 12 | 13 | 14 | class L2Norm(nn.Module): 15 | '''L2Norm layer across all channels and scale.''' 16 | def __init__(self, in_features,scale): 17 | super(L2Norm, self).__init__() 18 | self.weight = nn.Parameter(torch.Tensor(in_features)) 19 | self.reset_parameters(scale) 20 | 21 | def reset_parameters(self, scale): 22 | nn.init.constant_(self.weight, scale) 23 | 24 | def forward(self, x): 25 | x = F.normalize(x, dim=1) 26 | scale = self.weight[None,:,None,None] 27 | return scale * x 28 | 29 | 30 | class SSD300(nn.Module): 31 | input_size = 300 32 | 33 | def __init__(self): 34 | super(SSD300, self).__init__() 35 | 36 | # model 37 | self.base = self.VGG16() 38 | self.norm4 = L2Norm(512, 20) # 38 39 | 40 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 41 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 42 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1, dilation=1) 43 | 44 | self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 45 | 46 | self.conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 47 | 48 | self.conv8_1 = nn.Conv2d(1024, 256, kernel_size=1) 49 | self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2) 50 | 51 | self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1) 52 | self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2) 53 | 54 | self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1) 55 | self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3) 56 | 57 | 
self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1) 58 | self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3) 59 | 60 | # multibox layer 61 | self.multibox = MultiBoxLayer() 62 | 63 | def forward(self, x): 64 | hs = [] 65 | 66 | h = self.base(x) 67 | hs.append(self.norm4(h)) # conv4_3 68 | h = F.max_pool2d(h, kernel_size=2, stride=2, ceil_mode=True) 69 | 70 | h = F.relu(self.conv5_1(h)) 71 | h = F.relu(self.conv5_2(h)) 72 | h = F.relu(self.conv5_3(h)) 73 | h = F.max_pool2d(h, kernel_size=3, padding=1, stride=1, ceil_mode=True) 74 | 75 | h = F.relu(self.conv6(h)) 76 | h = F.relu(self.conv7(h)) 77 | hs.append(h) # conv7 78 | h = F.relu(self.conv8_1(h)) 79 | h = F.relu(self.conv8_2(h)) 80 | hs.append(h) # conv8_2 81 | h = F.relu(self.conv9_1(h)) 82 | h = F.relu(self.conv9_2(h)) 83 | hs.append(h) # conv9_2 84 | h = F.relu(self.conv10_1(h)) 85 | h = F.relu(self.conv10_2(h)) 86 | hs.append(h) # conv10_2 87 | h = F.relu(self.conv11_1(h)) 88 | h = F.relu(self.conv11_2(h)) 89 | hs.append(h) # conv11_2 90 | loc_preds, conf_preds = self.multibox(hs) 91 | 92 | return loc_preds, conf_preds 93 | 94 | def VGG16(self): 95 | '''VGG16 layers.''' 96 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512] 97 | layers = [] 98 | in_channels = 3 99 | for x in cfg: 100 | if x == 'M': 101 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 102 | else: 103 | layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 104 | nn.ReLU(True)] 105 | in_channels = x 106 | return nn.Sequential(*layers) 107 | 108 | if __name__ == '__main__': 109 | t = torch.randn(1, 3, 300, 300) 110 | net = SSD300() 111 | res = net.forward(t) 112 | -------------------------------------------------------------------------------- /SSD/scripts/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import argparse 5 | import itertools 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import torch.nn.functional as F 11 | import torch.backends.cudnn as cudnn 12 | 13 | import torchvision 14 | import torchvision.transforms as transforms 15 | 16 | import numpy as np 17 | from ssd import SSD300 18 | from datagen import ListDataset 19 | from multibox_loss import MultiBoxLoss 20 | 21 | lr = 0.001 22 | resume = False # Resume from checkpoint 23 | epoch = 200 24 | 25 | use_cuda = torch.cuda.is_available() 26 | best_loss = float('inf') # best test loss 27 | start_epoch = 0 # start from epoch 0 or last epoch 28 | 29 | # Data 30 | print('==> Preparing data..') 31 | transform = transforms.Compose([transforms.ToTensor(), 32 | transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]) 33 | 34 | # trainset = ListDataset(root='../data/train/images/', list_file='../data/train/annotations/', train=True, transform=transform) 35 | # trainloader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True, num_workers=4) 36 | 37 | # Model 38 | net = SSD300() 39 | # if resume: 40 | # print('==> Resuming from checkpoint..') 41 | # checkpoint = torch.load('../checkpoint/ckpt.pth') 42 | # net.load_state_dict(checkpoint['net']) 43 | # best_loss = checkpoint['loss'] 44 | # start_epoch = checkpoint['epoch'] 45 | # else: 46 | # # Convert from pretrained VGG model. 
47 | # try: 48 | # net.load_state_dict(torch.load('../model/ssd.pth')) 49 | # print('==> Pretrain model read successfully') 50 | # except: 51 | # print('==> Pretrain model read failed or not existed, training from init') 52 | print("Loaded Model") 53 | 54 | 55 | 56 | # criterion = MultiBoxLoss() 57 | 58 | # if use_cuda: 59 | # net = torch.nn.DataParallel(net, device_ids=[0]) 60 | # net.cuda() 61 | # cudnn.benchmark = True 62 | 63 | # optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4) 64 | 65 | # # Training 66 | # def train(epoch): 67 | # print('\nEpoch: %d' % epoch) 68 | # net.train() 69 | # train_loss = 0 70 | # # for batch_idx, (images, loc_targets, conf_targets) in enumerate(trainloader): 71 | # # if use_cuda: 72 | # # images = images.cuda() 73 | # # loc_targets = loc_targets.cuda() 74 | # # conf_targets = conf_targets.cuda() 75 | 76 | # # images = torch.tensor(images) 77 | # # loc_targets = torch.tensor(loc_targets) 78 | # # conf_targets = torch.tensor(conf_targets) 79 | 80 | # # optimizer.zero_grad() 81 | # # loc_preds, conf_preds = net(images) 82 | # # loss = criterion(loc_preds, loc_targets, conf_preds, conf_targets) 83 | # # loss.backward() 84 | # # optimizer.step() 85 | 86 | # # train_loss += loss.item() 87 | # # if batch_idx%100 == 0: 88 | # # os.makedirs('checkpoint', exist_ok=True) 89 | # # torch.save({ 90 | # # 'epoch': epoch, 91 | # # 'net': net.module.state_dict(), 92 | # # 'loss': loss, 93 | # # }, 'checkpoint/ckpt.pth') 94 | # # print('epoch: {}, batch_idx: {},loss: {}, train_loss: {}'.format(epoch, batch_idx, loss.item(), train_loss/(batch_idx+1))) 95 | 96 | 97 | # for epoch_num in range(1):#range(start_epoch, start_epoch+epoch): 98 | # train(epoch_num) --------------------------------------------------------------------------------