├── .gitignore ├── README.md ├── dataset ├── __pycache__ │ ├── bosch.cpython-36.pyc │ ├── data_preprocessing.cpython-36.pyc │ └── transforms.cpython-36.pyc ├── bosch.py ├── data_preprocessing.py └── transforms.py ├── main.py ├── models └── bosch-dataset-labels.txt └── network ├── FlowNet.py ├── MultiBoxLoss.py ├── RefineDet.py ├── __pycache__ ├── FlowNet.cpython-36.pyc ├── MultiBoxLoss.cpython-36.pyc ├── RefineDet.cpython-36.pyc ├── box_utils.cpython-36.pyc ├── network_utils.cpython-36.pyc └── prior_box.cpython-36.pyc ├── box_utils.py ├── network_utils.py └── prior_box.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Video-Object-Detection (Still in progress) 2 | ## Description 3 | Based on the paper: "Towards High Performance Video Object Detection" use Pytorch 0.4.1 and Python 3.6 4 | 5 | The model is currently running on Bosch Traffic Light Dataset only, but it will be easy to add another dataset by modifying dataloader. 6 | 7 | For training simply use 'python main.py' and set args according to your need. 8 | 9 | ## Reference Links 10 | The RefineDet's code is inspired by https://github.com/lzx1413/PytorchSSD. 11 | 12 | The vgg pretrained model is downloaded from https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth. 13 | 14 | The FlowNet's code and pretrained model are inspired and downloaded from https://github.com/NVIDIA/flownet2-pytorch. 15 | 16 | Please download the above 2 pretrained models and place them into 'models' directory. 
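
## Dataloader example
The Bosch dataloader is wired up in `main.py` roughly as sketched below; the dataset path and hyper-parameter values here are only illustrative, so adjust them to your setup (or simply pass the corresponding command-line arguments to `main.py`).

```python
import numpy as np
from torch.utils.data import DataLoader

from dataset.bosch import BoschTrainDetection, detection_collate
from dataset.data_preprocessing import TrainAugmentation

# 320x320 input with per-channel pixel means and std, as in main.py.
train_transform = TrainAugmentation(320, np.array([123, 117, 104]), 128.)

# root_dir is illustrative; point it at your local copy of the Bosch dataset.
train_dataset = BoschTrainDetection(root_dir="/path/to/Bosch_Dataset",
                                    yaml_file="train.yaml",
                                    transform=train_transform,
                                    target_transform=False)

# detection_collate stacks the images and keeps per-image annotation tensors in a list.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False,
                              num_workers=0, collate_fn=detection_collate)
```

Adding a new dataset mainly means providing a `Dataset` class with the same `(image, annotations)` output format and plugging it in here.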
17 | 
18 | ## Others
19 | Some notes on my understanding of the paper's design (some of these may be wrong; I will revisit them later):
20 | * During training, two frames are randomly selected, with the earlier one used as the key frame and the later one as the non-key frame.
21 | * The "q_propagate" factor is normalized to the range 0-1 during training so that it stays differentiable and gradients can be propagated backward. At inference time it should be either 0 or 1.
22 | * The is_aggr and is_partial flags are both set to True for all frame pairs during training, since each batch contains only a few key frames. At inference time they should be treated differently.
23 | 
24 | Some modifications of my own:
25 | * I use RefineDet instead of a single ResNet as the base detection network. Therefore, the FlowNet results are also used in some intermediate source layers in addition to the final layer.
26 | 
27 | TODO:
28 | * add multi-GPU support. (I am still a beginner with PyTorch.)
29 | * optimize which tensors are placed on the CPU versus the GPU.
30 | * add the inference part.
31 | * ...
32 | 
--------------------------------------------------------------------------------
/dataset/__pycache__/bosch.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/dataset/__pycache__/bosch.cpython-36.pyc
--------------------------------------------------------------------------------
/dataset/__pycache__/data_preprocessing.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/dataset/__pycache__/data_preprocessing.cpython-36.pyc
--------------------------------------------------------------------------------
/dataset/__pycache__/transforms.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/dataset/__pycache__/transforms.cpython-36.pyc
--------------------------------------------------------------------------------
/dataset/bosch.py:
--------------------------------------------------------------------------------
1 | import torch, torchvision
2 | import numpy as np
3 | from torch.utils.data import Dataset
4 | import yaml, os, sys, time, io
5 | from PIL import Image
6 | 
7 | # BOSCH_ROOT = os.path.join(HOME, 'Bosch_Dataset/')
8 | INDEX2LABEL = {0: 'None', 1: 'Green', 2: 'GreenStraightLeft', 3: 'GreenLeft', 4: 'RedLeft', 5: 'GreenStraightRight',
9 |                6: 'Red', 7: 'off', \
10 |                8: 'GreenRight', 9: 'GreenStraight', 10: 'Yellow', 11: 'RedRight', 12: 'RedStraight',
11 |                13: 'RedStraightLeft'}
12 | BOSCH_CLASSES = ['None', 'Green', 'GreenStraightLeft', 'GreenLeft', 'RedLeft', 'GreenStraightRight', 'Red', 'off', \
13 |                  'GreenRight', 'GreenStraight', 'Yellow', 'RedRight', 'RedStraight', 'RedStraightLeft']
14 | 
15 | 
16 | def detection_collate(batch):
17 |     """Custom collate fn for dealing with batches of images that have a different
18 |     number of associated object annotations (bounding boxes).
19 | 20 | Arguments: 21 | batch: (tuple) A tuple of tensor images and lists of annotations 22 | 23 | Return: 24 | A tuple containing: 25 | 1) (tensor) batch of images stacked on their 0 dim 26 | 2) (list of tensors) annotations for a given image are stacked on 0 dim 27 | """ 28 | targets = [] 29 | imgs = [] 30 | for _, sample in enumerate(batch): 31 | for _, tup in enumerate(sample): 32 | if torch.is_tensor(tup): 33 | imgs.append(tup) 34 | elif isinstance(tup, type(np.empty(0))): 35 | annos = torch.from_numpy(tup).float() 36 | targets.append(annos) 37 | 38 | return (torch.stack(imgs, 0), targets) 39 | 40 | 41 | class BoschTrainDetection(Dataset): 42 | decode_yaml_file = None 43 | color2index = {} 44 | index2color = {} 45 | class_names = BOSCH_CLASSES 46 | 47 | def __init__(self, root_dir, yaml_file, transform=None, target_transform=None): 48 | self.root_dir = root_dir 49 | self.yaml_file = yaml_file 50 | self.transform = transform 51 | self.target_transform = target_transform 52 | 53 | if not os.path.exists(os.path.join(self.root_dir, self.yaml_file)): 54 | print(os.path.join(self.root_dir, self.yaml_file)) 55 | print("input path parameters not valid.") 56 | return None 57 | 58 | self.color2index = {} 59 | self.index2color = {} 60 | color_array = [] 61 | 62 | # return as a dict 63 | self.decode_yaml_file = yaml.load(open(os.path.join(self.root_dir, self.yaml_file))) 64 | self.decode_yaml_file = list(filter(lambda x: len(x['boxes']) != 0, self.decode_yaml_file)) 65 | for item in self.decode_yaml_file: 66 | for box in item['boxes']: 67 | if box['label'] not in color_array: 68 | color_array.append(box['label']) 69 | 70 | for index in range(len(color_array)): 71 | self.color2index[color_array[index]] = index + 1 72 | self.index2color[index + 1] = color_array[index] 73 | self.color2index['bg'] = 0 74 | self.index2color[0] = 'bg' 75 | 76 | self.threshold = 0.5 77 | self.variance = [0.1, 0.2] 78 | 79 | self.decode_dataset = [] 80 | 81 | count = 0 82 | 83 | for element in self.decode_yaml_file: 84 | if len(element['boxes']) == 0: 85 | continue 86 | dataset_element = {} 87 | dataset_element['path'] = element['path'] 88 | dataset_element['annotation'] = [] 89 | for box_info in element['boxes']: 90 | dataset_element['annotation'].append(np.array( 91 | [box_info['x_min'], box_info['y_min'], box_info['x_max'], box_info['y_max'], 92 | self.color2index[box_info['label']]]).astype(np.float32)) 93 | dataset_element['annotation'] = np.array(dataset_element['annotation']).astype(np.float32) 94 | # dataset_element['annotation'] = np.transpose(np.array(dataset_element['annotation']), (1, 0)) 95 | self.decode_dataset.append(dataset_element) 96 | 97 | count += 1 98 | if count == 100: 99 | break 100 | 101 | image_name = os.path.join(self.root_dir, self.decode_dataset[0]['path']) 102 | #print("image shape:", np.array(Image.open(image_name)).shape) 103 | 104 | del self.decode_yaml_file 105 | 106 | def __len__(self): 107 | return len(self.decode_dataset) 108 | 109 | def __getitem__(self, index): 110 | image_name = os.path.join(self.root_dir, self.decode_dataset[index]['path']) 111 | image = Image.open(image_name) 112 | image = np.array(image) 113 | boxes = self.decode_dataset[index]['annotation'][:, : 4] 114 | labels = self.decode_dataset[index]['annotation'][:, 4] 115 | if self.transform: 116 | image, boxes, labels = self.transform(image, boxes, labels) 117 | if self.target_transform: 118 | """ 119 | print(self.priors) 120 | loc_t = torch.Tensor(1, self.priors.shape[0], 4) 121 | conf_t = torch.LongTensor(1, 
self.priors.shape[0]) 122 | match(self.threshold, torch.Tensor(boxes), torch.Tensor(self.priors).float(), self.variance, torch.Tensor(labels), loc_t, conf_t, 0) 123 | boxes, labels = loc_t[0], conf_t[0] 124 | """ 125 | boxes, labels = self.target_transform(boxes, labels) 126 | return image, np.concatenate((boxes, np.expand_dims(labels, -1)), 1) 127 | 128 | 129 | class BoschTestDetection(Dataset): 130 | 131 | def __init__(self): 132 | pass 133 | 134 | def __len__(self): 135 | pass 136 | 137 | def __getitem__(self, index): 138 | pass 139 | em__(self, index): 140 | pass 141 | -------------------------------------------------------------------------------- /dataset/data_preprocessing.py: -------------------------------------------------------------------------------- 1 | from .transforms import * 2 | 3 | 4 | """ 5 | ConvertFromInts(), 6 | PhotometricDistort(), 7 | Expand(self.mean), 8 | RandomSampleCrop(), 9 | RandomMirror(), 10 | """ 11 | 12 | 13 | class TrainAugmentation: 14 | def __init__(self, size, mean=0, std=1.0): 15 | """ 16 | Args: 17 | size: the size the of final image. 18 | mean: mean pixel value per channel. 19 | """ 20 | self.mean = mean 21 | self.size = size 22 | self.augment = Compose([ 23 | ToPercentCoords(), 24 | Resize(self.size), 25 | SubtractMeans(self.mean), 26 | lambda img, boxes=None, labels=None: (img / 255., boxes, labels), 27 | ToTensor(), 28 | ]) 29 | 30 | def __call__(self, img, boxes, labels): 31 | """ 32 | 33 | Args: 34 | img: the output of cv.imread in RGB layout. 35 | boxes: boundding boxes in the form of (x1, y1, x2, y2). 36 | labels: labels of boxes. 37 | """ 38 | return self.augment(img, boxes, labels) 39 | 40 | 41 | class TestTransform: 42 | def __init__(self, size, mean=0.0, std=1.0): 43 | self.transform = Compose([ 44 | ToPercentCoords(), 45 | Resize(size), 46 | SubtractMeans(mean), 47 | lambda img, boxes=None, labels=None: (img / std, boxes, labels), 48 | ToTensor(), 49 | ]) 50 | 51 | def __call__(self, image, boxes, labels): 52 | return self.transform(image, boxes, labels) 53 | 54 | 55 | class PredictionTransform: 56 | def __init__(self, size, mean=0.0, std=1.0): 57 | self.transform = Compose([ 58 | Resize(size), 59 | SubtractMeans(mean), 60 | lambda img, boxes=None, labels=None: (img / std, boxes, labels), 61 | ToTensor() 62 | ]) 63 | 64 | def __call__(self, image): 65 | image, _, _ = self.transform(image) 66 | return image -------------------------------------------------------------------------------- /dataset/transforms.py: -------------------------------------------------------------------------------- 1 | # from https://github.com/amdegroot/ssd.pytorch 2 | 3 | 4 | import torch 5 | from torchvision import transforms 6 | import cv2 7 | import numpy as np 8 | import types 9 | from numpy import random 10 | 11 | 12 | def intersect(box_a, box_b): 13 | max_xy = np.minimum(box_a[:, 2:], box_b[2:]) 14 | min_xy = np.maximum(box_a[:, :2], box_b[:2]) 15 | inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) 16 | return inter[:, 0] * inter[:, 1] 17 | 18 | 19 | def jaccard_numpy(box_a, box_b): 20 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 21 | is simply the intersection over union of two boxes. 
22 | E.g.: 23 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 24 | Args: 25 | box_a: Multiple bounding boxes, Shape: [num_boxes,4] 26 | box_b: Single bounding box, Shape: [4] 27 | Return: 28 | jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] 29 | """ 30 | inter = intersect(box_a, box_b) 31 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 32 | (box_a[:, 3]-box_a[:, 1])) # [A,B] 33 | area_b = ((box_b[2]-box_b[0]) * 34 | (box_b[3]-box_b[1])) # [A,B] 35 | union = area_a + area_b - inter 36 | return inter / union # [A,B] 37 | 38 | 39 | class Compose(object): 40 | """Composes several augmentations together. 41 | Args: 42 | transforms (List[Transform]): list of transforms to compose. 43 | Example: 44 | >>> augmentations.Compose([ 45 | >>> transforms.CenterCrop(10), 46 | >>> transforms.ToTensor(), 47 | >>> ]) 48 | """ 49 | 50 | def __init__(self, transforms): 51 | self.transforms = transforms 52 | 53 | def __call__(self, img, boxes=None, labels=None): 54 | for t in self.transforms: 55 | img, boxes, labels = t(img, boxes, labels) 56 | return img, boxes, labels 57 | 58 | 59 | class Lambda(object): 60 | """Applies a lambda as a transform.""" 61 | 62 | def __init__(self, lambd): 63 | assert isinstance(lambd, types.LambdaType) 64 | self.lambd = lambd 65 | 66 | def __call__(self, img, boxes=None, labels=None): 67 | return self.lambd(img, boxes, labels) 68 | 69 | 70 | class ConvertFromInts(object): 71 | def __call__(self, image, boxes=None, labels=None): 72 | return image.astype(np.float32), boxes, labels 73 | 74 | 75 | class SubtractMeans(object): 76 | def __init__(self, mean): 77 | self.mean = np.array(mean, dtype=np.float32) 78 | 79 | def __call__(self, image, boxes=None, labels=None): 80 | image = image.astype(np.float32) 81 | image -= self.mean 82 | return image.astype(np.float32), boxes, labels 83 | 84 | 85 | class ToAbsoluteCoords(object): 86 | def __call__(self, image, boxes=None, labels=None): 87 | height, width, channels = image.shape 88 | boxes[:, 0] *= width 89 | boxes[:, 2] *= width 90 | boxes[:, 1] *= height 91 | boxes[:, 3] *= height 92 | 93 | return image, boxes, labels 94 | 95 | 96 | class ToPercentCoords(object): 97 | def __call__(self, image, boxes=None, labels=None): 98 | height, width, channels = image.shape 99 | boxes[:, 0] /= width 100 | boxes[:, 2] /= width 101 | boxes[:, 1] /= height 102 | boxes[:, 3] /= height 103 | 104 | return image, boxes, labels 105 | 106 | 107 | class Resize(object): 108 | def __init__(self, size=300): 109 | self.size = size 110 | 111 | def __call__(self, image, boxes=None, labels=None): 112 | image = cv2.resize(image, (self.size, 113 | self.size)) 114 | return image, boxes, labels 115 | 116 | 117 | class RandomSaturation(object): 118 | def __init__(self, lower=0.5, upper=1.5): 119 | self.lower = lower 120 | self.upper = upper 121 | assert self.upper >= self.lower, "contrast upper must be >= lower." 122 | assert self.lower >= 0, "contrast lower must be non-negative." 
123 | 124 | def __call__(self, image, boxes=None, labels=None): 125 | if random.randint(2): 126 | image[:, :, 1] *= random.uniform(self.lower, self.upper) 127 | 128 | return image, boxes, labels 129 | 130 | 131 | class RandomHue(object): 132 | def __init__(self, delta=18.0): 133 | assert delta >= 0.0 and delta <= 360.0 134 | self.delta = delta 135 | 136 | def __call__(self, image, boxes=None, labels=None): 137 | if random.randint(2): 138 | image[:, :, 0] += random.uniform(-self.delta, self.delta) 139 | image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 140 | image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 141 | return image, boxes, labels 142 | 143 | 144 | class RandomLightingNoise(object): 145 | def __init__(self): 146 | self.perms = ((0, 1, 2), (0, 2, 1), 147 | (1, 0, 2), (1, 2, 0), 148 | (2, 0, 1), (2, 1, 0)) 149 | 150 | def __call__(self, image, boxes=None, labels=None): 151 | if random.randint(2): 152 | swap = self.perms[random.randint(len(self.perms))] 153 | shuffle = SwapChannels(swap) # shuffle channels 154 | image = shuffle(image) 155 | return image, boxes, labels 156 | 157 | 158 | class ConvertColor(object): 159 | def __init__(self, current, transform): 160 | self.transform = transform 161 | self.current = current 162 | 163 | def __call__(self, image, boxes=None, labels=None): 164 | if self.current == 'BGR' and self.transform == 'HSV': 165 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 166 | elif self.current == 'RGB' and self.transform == 'HSV': 167 | image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV) 168 | elif self.current == 'BGR' and self.transform == 'RGB': 169 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 170 | elif self.current == 'HSV' and self.transform == 'BGR': 171 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 172 | elif self.current == 'HSV' and self.transform == "RGB": 173 | image = cv2.cvtColor(image, cv2.COLOR_HSV2RGB) 174 | else: 175 | raise NotImplementedError 176 | return image, boxes, labels 177 | 178 | 179 | class RandomContrast(object): 180 | def __init__(self, lower=0.5, upper=1.5): 181 | self.lower = lower 182 | self.upper = upper 183 | assert self.upper >= self.lower, "contrast upper must be >= lower." 184 | assert self.lower >= 0, "contrast lower must be non-negative." 
185 | 186 | # expects float image 187 | def __call__(self, image, boxes=None, labels=None): 188 | if random.randint(2): 189 | alpha = random.uniform(self.lower, self.upper) 190 | image *= alpha 191 | return image, boxes, labels 192 | 193 | 194 | class RandomBrightness(object): 195 | def __init__(self, delta=32): 196 | assert delta >= 0.0 197 | assert delta <= 255.0 198 | self.delta = delta 199 | 200 | def __call__(self, image, boxes=None, labels=None): 201 | if random.randint(2): 202 | delta = random.uniform(-self.delta, self.delta) 203 | image += delta 204 | return image, boxes, labels 205 | 206 | 207 | class ToCV2Image(object): 208 | def __call__(self, tensor, boxes=None, labels=None): 209 | return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels 210 | 211 | 212 | class ToTensor(object): 213 | def __call__(self, cvimage, boxes=None, labels=None): 214 | return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels 215 | 216 | 217 | class RandomSampleCrop(object): 218 | """Crop 219 | Arguments: 220 | img (Image): the image being input during training 221 | boxes (Tensor): the original bounding boxes in pt form 222 | labels (Tensor): the class labels for each bbox 223 | mode (float tuple): the min and max jaccard overlaps 224 | Return: 225 | (img, boxes, classes) 226 | img (Image): the cropped image 227 | boxes (Tensor): the adjusted bounding boxes in pt form 228 | labels (Tensor): the class labels for each bbox 229 | """ 230 | def __init__(self): 231 | self.sample_options = ( 232 | # using entire original input image 233 | None, 234 | # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9 235 | (0.1, None), 236 | (0.3, None), 237 | (0.7, None), 238 | (0.9, None), 239 | # randomly sample a patch 240 | (None, None), 241 | ) 242 | 243 | def __call__(self, image, boxes=None, labels=None): 244 | height, width, _ = image.shape 245 | while True: 246 | # randomly choose a mode 247 | mode = random.choice(self.sample_options) 248 | if mode is None: 249 | return image, boxes, labels 250 | 251 | min_iou, max_iou = mode 252 | if min_iou is None: 253 | min_iou = float('-inf') 254 | if max_iou is None: 255 | max_iou = float('inf') 256 | 257 | # max trails (50) 258 | for _ in range(50): 259 | current_image = image 260 | 261 | w = random.uniform(0.3 * width, width) 262 | h = random.uniform(0.3 * height, height) 263 | 264 | # aspect ratio constraint b/t .5 & 2 265 | if h / w < 0.5 or h / w > 2: 266 | continue 267 | 268 | left = random.uniform(width - w) 269 | top = random.uniform(height - h) 270 | 271 | # convert to integer rect x1,y1,x2,y2 272 | rect = np.array([int(left), int(top), int(left+w), int(top+h)]) 273 | 274 | # calculate IoU (jaccard overlap) b/t the cropped and gt boxes 275 | overlap = jaccard_numpy(boxes, rect) 276 | 277 | # is min and max overlap constraint satisfied? 
if not try again 278 | if overlap.min() < min_iou and max_iou < overlap.max(): 279 | continue 280 | 281 | # cut the crop from the image 282 | current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], 283 | :] 284 | 285 | # keep overlap with gt box IF center in sampled patch 286 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 287 | 288 | # mask in all gt boxes that above and to the left of centers 289 | m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) 290 | 291 | # mask in all gt boxes that under and to the right of centers 292 | m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) 293 | 294 | # mask in that both m1 and m2 are true 295 | mask = m1 * m2 296 | 297 | # have any valid boxes? try again if not 298 | if not mask.any(): 299 | continue 300 | 301 | # take only matching gt boxes 302 | current_boxes = boxes[mask, :].copy() 303 | 304 | # take only matching gt labels 305 | current_labels = labels[mask] 306 | 307 | # should we use the box left and top corner or the crop's 308 | current_boxes[:, :2] = np.maximum(current_boxes[:, :2], 309 | rect[:2]) 310 | # adjust to crop (by substracting crop's left,top) 311 | current_boxes[:, :2] -= rect[:2] 312 | 313 | current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], 314 | rect[2:]) 315 | # adjust to crop (by substracting crop's left,top) 316 | current_boxes[:, 2:] -= rect[:2] 317 | 318 | return current_image, current_boxes, current_labels 319 | 320 | 321 | class Expand(object): 322 | def __init__(self, mean): 323 | self.mean = mean 324 | 325 | def __call__(self, image, boxes, labels): 326 | if random.randint(2): 327 | return image, boxes, labels 328 | 329 | height, width, depth = image.shape 330 | ratio = random.uniform(1, 4) 331 | left = random.uniform(0, width*ratio - width) 332 | top = random.uniform(0, height*ratio - height) 333 | 334 | expand_image = np.zeros( 335 | (int(height*ratio), int(width*ratio), depth), 336 | dtype=image.dtype) 337 | expand_image[:, :, :] = self.mean 338 | expand_image[int(top):int(top + height), 339 | int(left):int(left + width)] = image 340 | image = expand_image 341 | 342 | boxes = boxes.copy() 343 | boxes[:, :2] += (int(left), int(top)) 344 | boxes[:, 2:] += (int(left), int(top)) 345 | 346 | return image, boxes, labels 347 | 348 | 349 | class RandomMirror(object): 350 | def __call__(self, image, boxes, classes): 351 | _, width, _ = image.shape 352 | if random.randint(2): 353 | image = image[:, ::-1] 354 | boxes = boxes.copy() 355 | boxes[:, 0::2] = width - boxes[:, 2::-2] 356 | return image, boxes, classes 357 | 358 | 359 | class SwapChannels(object): 360 | """Transforms a tensorized image by swapping the channels in the order 361 | specified in the swap tuple. 
362 | Args: 363 | swaps (int triple): final order of channels 364 | eg: (2, 1, 0) 365 | """ 366 | 367 | def __init__(self, swaps): 368 | self.swaps = swaps 369 | 370 | def __call__(self, image): 371 | """ 372 | Args: 373 | image (Tensor): image tensor to be transformed 374 | Return: 375 | a tensor with channels swapped according to swap 376 | """ 377 | # if torch.is_tensor(image): 378 | # image = image.data.cpu().numpy() 379 | # else: 380 | # image = np.array(image) 381 | image = image[:, :, self.swaps] 382 | return image 383 | 384 | 385 | class PhotometricDistort(object): 386 | def __init__(self): 387 | self.pd = [ 388 | RandomContrast(), # RGB 389 | ConvertColor(current="RGB", transform='HSV'), # HSV 390 | RandomSaturation(), # HSV 391 | RandomHue(), # HSV 392 | ConvertColor(current='HSV', transform='RGB'), # RGB 393 | RandomContrast() # RGB 394 | ] 395 | self.rand_brightness = RandomBrightness() 396 | self.rand_light_noise = RandomLightingNoise() 397 | 398 | def __call__(self, image, boxes, labels): 399 | im = image.copy() 400 | im, boxes, labels = self.rand_brightness(im, boxes, labels) 401 | if random.randint(2): 402 | distort = Compose(self.pd[:-1]) 403 | else: 404 | distort = Compose(self.pd[1:]) 405 | im, boxes, labels = distort(im, boxes, labels) 406 | return self.rand_light_noise(im, boxes, labels) 407 | 408 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import torch.backends.cudnn as cudnn 5 | import torch.nn.functional as F 6 | import torchvision.transforms as transforms 7 | import torchvision 8 | from torch.nn import init 9 | from torch.utils.data import DataLoader 10 | from torch.autograd import Variable 11 | 12 | import numpy as np 13 | 14 | import visdom 15 | import logging 16 | 17 | import os, sys, time, io, datetime 18 | import random 19 | import argparse 20 | 21 | from network.RefineDet import RefineDetArm, RefineDetObm 22 | from network.FlowNet import FlowNetS 23 | from network.network_utils import EmbeddingNetwork, L2Norm 24 | from network.MultiBoxLoss import RefineMultiBoxLoss 25 | from dataset.bosch import BoschTrainDetection, BoschTestDetection, detection_collate 26 | from dataset.data_preprocessing import TrainAugmentation, TestTransform 27 | from network.prior_box import PriorBox 28 | 29 | def reindex_tensor(input_tensor, input_index): 30 | if isinstance(input_index, list): 31 | input_index = torch.Tensor(input_index) 32 | elif isinstance(input_index, np.array): 33 | input_index = torch.from_numpy(input_index) 34 | input_index = input_index.long() 35 | per_batch_length = np.prod(np.array(input_tensor.size())[1: ]) 36 | expand_index = input_index.unsqueeze(-1).repeat(1, per_batch_length).view(input_tensor.size()) 37 | return torch.gather(input_tensor, 0, expand_index) 38 | 39 | def reindex(array, index): 40 | return [array[item] for item in index] 41 | 42 | def str2bool(v): 43 | return v.lower() in ("yes", "true", "1", "t") 44 | 45 | """ 46 | def cosine_similarity(preceed, current): 47 | preceed_vector = EmbeddingNetwork(preceed) 48 | current_vector = EmbeddingNetwork(current) 49 | preceed_vector_sum_sqrt = torch.sqrt(torch.sum(torch.pow(preceed_vector, 2), 1)) 50 | current_vector_sum_sqrt = torch.sqrt(torch.sum(torch.pow(current_vector, 2), 1)) 51 | 52 | return torch.exp(preceed_vector * current_vector / (preceed_vector_sum_sqrt * current_vector_sum_sqrt)) 53 | 
""" 54 | 55 | # input: [batch_size, channels, height, width], gather_index: [batch_index, 2, height, width] 56 | def gather_nd(input, gather_index): 57 | input.cuda() 58 | gather_index.cuda() 59 | base_index_x, base_index_y = torch.meshgrid([torch.arange(input.size()[2]), torch.arange(input.size()[3])]) 60 | base_index = torch.stack([base_index_x, base_index_y], -1).view(input.size()[2], input.size()[3], 2) 61 | base_index = torch.stack([base_index for _ in range(input.size()[0])]).double() 62 | 63 | input = input.permute(0, 2, 3, 1).contiguous().double() 64 | gather_index = gather_index.permute(0, 2, 3, 1).contiguous().double() 65 | gather_index = base_index + gather_index 66 | gather_index = gather_index.view(-1, 2).double() 67 | clamp_gather_index = torch.DoubleTensor(gather_index.size()).cuda() 68 | clamp_gather_index[:, 0] = torch.clamp(gather_index[:, 0], 0., float(input.size()[1] - 1)).double() 69 | clamp_gather_index[:, 1] = torch.clamp(gather_index[:, 1], 0., float(input.size()[2] - 1)).double() 70 | gather_index_ceil = torch.ceil(clamp_gather_index).double() 71 | gather_index_floor = torch.floor(clamp_gather_index).double() 72 | 73 | output = [] 74 | for i in range(gather_index.size()[0]): 75 | batch_index = i // (input.size()[1] * input.size()[1]) 76 | 77 | cor_x, cor_y = clamp_gather_index[i][0], clamp_gather_index[i][1] 78 | cor_x_ceil, cor_y_ceil = gather_index_ceil[i][0], gather_index_ceil[i][1] 79 | cor_x_floor, cor_y_floor = gather_index_floor[i][0], gather_index_floor[i][1] 80 | weight_ceil_x, weight_ceil_y = cor_x - cor_x_floor, cor_y - cor_y_floor 81 | weight_floor_x, weight_floor_y = cor_x_ceil - cor_x, cor_y_ceil - cor_y 82 | 83 | output_ceil = input[batch_index, cor_x_ceil.int(), cor_y_ceil.int()] 84 | output_floor = input[batch_index, cor_x_floor.int(), cor_y_floor.int()] 85 | output_y_ceil = weight_ceil_x * input[batch_index, cor_x_ceil.int(), cor_y_ceil.int()] + weight_floor_x * input[batch_index, cor_x_floor.int(), cor_y_ceil.int()] 86 | output_y_floor = weight_ceil_x * input[batch_index, cor_x_ceil.int(), cor_y_floor.int()] + weight_floor_x * input[batch_index, cor_x_floor.int(), cor_y_floor.int()] 87 | output.append(weight_ceil_y * output_y_ceil + weight_floor_y * output_y_floor) 88 | 89 | result = torch.stack(output, 0).view(tuple(input.size())).permute(0, 3, 1, 2).contiguous().float() 90 | 91 | return result 92 | 93 | # TODO: set batch_size=1 may cause error from cosine similarity part, check unsqueeze afterwards 94 | parser = argparse.ArgumentParser() 95 | 96 | parser.add_argument('-v', '--version', default='RefineDet', help='feature network') 97 | parser.add_argument('-s', '--size', default=320, help='320 or 512 input size') 98 | parser.add_argument('-d', '--dataset', default='Bosch', help='Cityscapes, ImageNet VID, Bosch or Sensetime') 99 | parser.add_argument('-b', '--batch_size', default=4, type=int, help='batch size') 100 | 101 | parser.add_argument('--jaccard_threshold', default=0.5, type=float, help='min jaccard index for matching') 102 | parser.add_argument('--num_workers', default=0, type=int, help='number of workers in dataloading') 103 | parser.add_argument('--cuda', default=True, type=bool, help='use cuda') 104 | parser.add_argument('--gpu_id', default=0, type=int, help='gpu list') 105 | parser.add_argument('--lr', '--learning_rate ', default=1e-4, help='initial learning rate') 106 | parser.add_argument('--base_lr', default=1e-8, help='base feature network learning rate') 107 | parser.add_argument('--flownet_lr', default=1e-8, help='flownet 
learning rate') 108 | parser.add_argument('--momentum', default=0.9, type=float, help='momentum') 109 | 110 | parser.add_argument('--feature_basenet', default="models/vgg16_reducedfc.pth", help='feature network pretrained model') 111 | parser.add_argument('--flow_basenet', default="models/FlowNet2-S_checkpoint.pth.tar", help='flow network pretrained model') 112 | parser.add_argument('--dataset_path', default="/mnt/lustre/zhoukeyang/Bosch_Dataset") 113 | 114 | parser.add_argument('--debug_step', default=100, help='debug step') 115 | parser.add_argument('--q_threshold', default=0., help='q threshold') 116 | parser.add_argument('--use_aggr', default=True, help='use aggregation') 117 | parser.add_argument('--use_partial', default=True, help='use partial feature updating') 118 | parser.add_argument('--mask_loss_weight', default=1., type=float, help='update mask loss weight') 119 | 120 | parser.add_argument('--resume', default=False, help='resume net for retraining') 121 | parser.add_argument('--resume_epoch', default=0, type=int, help='resume iteration for retraining') 122 | 123 | parser.add_argument('--max_epoch', default=300, type=int, help='max epoch for retraining') 124 | parser.add_argument('--valid_iter', default=1, type=int, help='epoch that print the loss') 125 | parser.add_argument('--save_folder', default='models', type=str, help='location to save checkpoint') 126 | parser.add_argument('--save_iter', default=3, type=int, help='epoch that save the model') 127 | parser.add_argument('--visdom', default=False, help='use visualization') 128 | parser.add_argument('--num_classes', default=14, help='num classes') 129 | 130 | parser.add_argument('--is_training', default=True, type=bool, help='training or validating') 131 | 132 | args = parser.parse_args() 133 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 134 | DEVICE = torch.device("cuda:0" if torch.cuda.is_available() and args.cuda else "cpu") 135 | 136 | VOC_320 = { 137 | 'feature_maps': [40, 20, 10, 5], 138 | 'min_dim': 320, 139 | 'steps': [8, 16, 32, 64], 140 | 'min_sizes': [32, 64, 128, 256], 141 | 'max_sizes': [], 142 | 'aspect_ratios': [[2], [2], [2], [2]], 143 | 'variance': [0.1, 0.2], 144 | 'clip': True, 145 | } 146 | 147 | 148 | def test(dataloader, refinedet_arm, refinedet_obm, flownetS, arm_criterion, obm_criterion, device): 149 | refinedet_arm.eval(True) 150 | refinedet_obm.eval(True) 151 | flownetS.eval(True) 152 | 153 | 154 | def train(dataloader, refinedet_arm, refinedet_obm, flownetS, arm_criterion, obm_criterion, optimizer, device, epoch): 155 | refinedet_arm.train(True) 156 | refinedet_obm.train(True) 157 | flownetS.train(True) 158 | 159 | previous_images, previous_arm_conf, previous_arm_loc = None, None, None 160 | preceed_out, preceed_featuremap, preceed_conf, preceed_loc, preceed_input = [], [], [], [], [] 161 | 162 | running_loss = 0. 163 | running_arm_regression_loss = 0. 164 | running_arm_classification_loss = 0. 165 | running_obm_regression_loss = 0. 166 | running_obm_classification_loss = 0. 167 | running_updating_mask_loss = 0. 
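    # Per-iteration flow of the loop below:
    #   1. for each batch element, randomly pick an earlier element in the batch as its key frame (corresponding_key)
    #   2. run the ARM part of RefineDet on the current frames
    #   3. run FlowNetS on the concatenated (current, key) images to get the flow field and q_propagate
    #   4. if use_partial: derive updating_mask from q_propagate and blend warped key-frame features into the current features
    #   5. if use_aggr: aggregate key-frame and current features, weighted by an embedding cosine similarity
    #   6. run the OBM part and accumulate the ARM/OBM multibox losses plus the updating-mask penalty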
168 | 169 | for i, data in enumerate(dataloader): 170 | previous_images, previous_arm_conf, previous_arm_loc = None, None, None 171 | preceed_out, preceed_featuremap, preceed_conf, preceed_loc, preceed_input = [], [], [], [], [] 172 | 173 | corresponding_key = [] 174 | for batch_index in range(args.batch_size): 175 | corresponding_key.append(random.randint(0, batch_index)) 176 | images, targets = data[0], data[1] 177 | images = images.to(device) 178 | #boxes = boxes.to(device) 179 | #labels = labels.to(device) 180 | preceed_input = images 181 | 182 | #preceed_images = torch.stack((torch.unbind(preceed_images, 0)[corresponding_key]), 0).to(device) 183 | #preceed_images = torch.stack(reindex(torch.unbind(preceed_input, 0), corresponding_key), 0).to(device) 184 | preceed_images = reindex_tensor(preceed_input, corresponding_key) 185 | 186 | optimizer.zero_grad() 187 | 188 | # arm_sources 512, 512, 1024, 512 189 | out, arm_sources, arm_conf, arm_loc = refinedet_arm(images) 190 | images_stack = torch.cat((images, preceed_images), 1) 191 | images_stack = F.interpolate(images_stack, size=(256, 256), mode='bilinear') 192 | flow_result, flow_list, q_propagate = flownetS(images_stack) 193 | 194 | preceed_out.append(out) 195 | preceed_featuremap.append(arm_sources) 196 | preceed_conf = arm_conf 197 | preceed_loc = arm_loc 198 | arm_out = out 199 | 200 | updating_mask = torch.zeros(size=(1, )) 201 | 202 | # enforce q_propagate equals to 1 and 0 with 1/3 probability add later... 203 | if args.use_partial: 204 | updating_mask = torch.clamp(q_propagate - args.q_threshold + 0.5, 0., 1.) 205 | prop_condition = 1 - updating_mask 206 | prop_condition = prop_condition.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1) 207 | 208 | new_arm_sources = [] 209 | 210 | for j, arm_source_item in enumerate(arm_sources): 211 | flow_map = F.interpolate(flow_result, size=tuple(arm_source_item.size())[2:], mode='bilinear') * float(arm_source_item.size()[2] / flow_result.size()[2]) 212 | #flow_map *= float(arm_source_item.size()[2] / flow_result.size()[2]) 213 | new_preceed = gather_nd(reindex_tensor(arm_source_item, corresponding_key), flow_map) 214 | new_arm_sources.append(prop_condition * new_preceed + (1. - prop_condition) * arm_source_item) 215 | #new_arm_sources.append(prop_condition * gather_nd(torch.stack(reindex(torch.unbind(arm_source_item, 0), corresponding_key), 0), flow_map) + (1. - prop_condition) * arm_source_item) 216 | """ 217 | arm_conf_original_shape = arm_conf.size() 218 | arm_loc_original_shape = arm_loc.size() 219 | 220 | arm_conf_shape = list(flow_result.size()) 221 | arm_conf_shape[1] = 2 222 | arm_loc_shape = list(flow_result.size()) 223 | arm_loc_shape[1] = 2 224 | 225 | arm_conf = arm_conf.view(arm_conf_shape) 226 | arm_loc = arm_loc.view(arm_loc_shape) 227 | arm_conf = prop_condition * gather_nd(torch.stack(reindex(torch.unbind(arm_conf, 0), corresponding_key), 0), flow_result) + (1. - prop_condition) * arm_conf 228 | arm_loc = prop_condition * gather_nd(torch.stack(reindex(torch.unbind(arm_loc, 0), corresponding_key), 0), flow_result) + (1. - prop_condition) * arm_loc 229 | 230 | arm_conf = arm_conf.view(arm_conf_original_shape) 231 | arm_loc = arm_loc.view(arm_loc_original_shape) 232 | """ 233 | arm_out = new_arm_sources[-1] 234 | arm_sources = new_arm_sources 235 | 236 | # warp operation add later... 
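        # Aggregation branch below: embed the key-frame and current ARM features, turn their
        # cosine similarity into blending weights, warp the key-frame features with the flow
        # field (gather_nd performs the bilinear sampling), and blend the key-frame and
        # current feature maps using those weights.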
237 | if args.use_aggr: 238 | def cosine_similarity(preceed, current, Embedding): 239 | preceed_vector = Embedding(preceed) 240 | current_vector = Embedding(current) 241 | preceed_vector_sum_sqrt = torch.sqrt(torch.sum(torch.pow(preceed_vector, 2), -1)) 242 | current_vector_sum_sqrt = torch.sqrt(torch.sum(torch.pow(current_vector, 2), -1)) 243 | 244 | return torch.exp(torch.sum(preceed_vector * current_vector, 1) / (preceed_vector_sum_sqrt * current_vector_sum_sqrt)).float() 245 | 246 | preceed = [] 247 | new_arm_sources = [] 248 | new_preceed = [] 249 | for item in arm_sources: 250 | #preceed.append(torch.stack(reindex(torch.unbind(item, 0), corresponding_key), 0)) 251 | preceed.append(reindex_tensor(item, corresponding_key)) 252 | succeed = arm_sources 253 | 254 | # wrap key frame using flow result 255 | for j, preceed_map in enumerate(preceed): 256 | flow_map = F.interpolate(flow_result, size=tuple(preceed_map.size())[2: ], mode='bilinear') 257 | flow_map *= (preceed_map.size()[2] / flow_result.size()[2]) 258 | 259 | propagate_similarity = cosine_similarity(preceed[j].float(), succeed[j].float(), EmbeddingNetwork[j]) 260 | propagate_similarity = propagate_similarity / (1. + propagate_similarity) 261 | self_similarity = 1. - propagate_similarity 262 | propagate_similarity, self_similarity = torch.mean(propagate_similarity), torch.mean(self_similarity) 263 | 264 | new_preceed.append(gather_nd(preceed_map, flow_map)) 265 | new_arm_sources.append(self_similarity * succeed[j] + propagate_similarity * preceed[j]) 266 | arm_out = new_arm_sources[-1] 267 | arm_sources = new_arm_sources 268 | 269 | obm_out = refinedet_obm(arm_out, arm_sources, arm_conf, arm_loc, is_training=True) 270 | feature_layer, arm_conf, arm_loc, obm_conf, obm_loc = obm_out 271 | 272 | #labels = torch.unsqueeze(labels, -1) 273 | #arm_targets = torch.cat((boxes, torch.gt(labels, 0).float()), -1) 274 | #obm_targets = torch.cat((boxes, labels.float()), -1) 275 | arm_regression_loss, arm_classification_loss = arm_criterion((arm_loc, arm_conf), priors, targets) 276 | obm_regression_loss, obm_classification_loss = obm_criterion((obm_loc, obm_conf), priors, targets, (arm_loc, arm_conf), False) 277 | 278 | arm_detection_loss = (arm_regression_loss + arm_classification_loss).double() 279 | obm_detection_loss = (obm_regression_loss + obm_classification_loss).double() 280 | update_mask_loss = torch.sum(updating_mask).double() 281 | loss = arm_detection_loss + obm_detection_loss + update_mask_loss 282 | 283 | #loss.backward() 284 | #optimizer.step() 285 | 286 | running_loss += loss.item() 287 | running_arm_regression_loss += arm_regression_loss.item() 288 | running_arm_classification_loss += arm_classification_loss.item() 289 | running_obm_regression_loss += obm_regression_loss.item() 290 | running_obm_classification_loss += obm_classification_loss.item() 291 | running_updating_mask_loss += update_mask_loss.item() 292 | 293 | if i and i % args.debug_step == 0: 294 | avg_loss = running_loss / args.debug_step 295 | avg_arm_reg_loss = running_arm_regression_loss / args.debug_step 296 | avg_arm_clf_loss = running_arm_classification_loss / args.debug_step 297 | avg_obm_reg_loss = running_obm_regression_loss / args.debug_step 298 | avg_obm_clf_loss = running_obm_classification_loss / args.debug_step 299 | avg_update_mask_loss = running_updating_mask_loss / args.debug_step 300 | print("Epoch: {}, Step: {}, Avg loss: {}, Avg arm loss: {}, Avg obm loss: {}, Update mask loss: {}" 301 | .format(epoch, i, avg_loss, avg_arm_reg_loss + 
avg_arm_clf_loss, avg_obm_reg_loss + avg_obm_clf_loss, avg_update_mask_loss)) 302 | running_loss, running_arm_regression_loss, running_arm_classification_loss, running_obm_regression_loss, running_obm_classification_loss, running_updating_mask_loss = 0., 0., 0., 0. 303 | 304 | 305 | if __name__ == "__main__": 306 | save_folder = args.save_folder + "_" + datetime.datetime.now().strftime('%Y%m%d%H%M%S') 307 | if not os.path.exists(save_folder): 308 | os.mkdir(save_folder) 309 | 310 | if args.visdom: 311 | viz = visdom.Visdom() 312 | 313 | cfg = VOC_320 314 | priorbox = PriorBox(cfg) 315 | priors = priorbox.forward() 316 | #priors = Variable(priorbox.forward(), volatile=True) 317 | 318 | train_transform = TrainAugmentation(args.size, np.array([123, 117, 104]), 128.) 319 | train_dataset = BoschTrainDetection(root_dir=args.dataset_path, \ 320 | yaml_file="train.yaml", transform=train_transform, target_transform=False) 321 | 322 | print(len(train_dataset)) 323 | 324 | label_file = os.path.join(save_folder, "bosch-dataset-labels.txt") 325 | with open(label_file, "w") as f: 326 | f.write("\n".join(train_dataset.class_names)) 327 | num_classes = len(train_dataset.class_names) 328 | 329 | refinedet_arm = RefineDetArm(vgg_type='300', in_channels=3, batch_norm=args.is_training) 330 | refinedet_obm = RefineDetObm(num_classes=num_classes, batch_norm=args.is_training) 331 | flownetS = FlowNetS(in_channels=6, is_training=True) 332 | 333 | EmbeddingCosine1 = EmbeddingNetwork(512) 334 | EmbeddingCosine2 = EmbeddingNetwork(512) 335 | EmbeddingCosine3 = EmbeddingNetwork(1024) 336 | EmbeddingCosine4 = EmbeddingNetwork(512) 337 | 338 | EmbeddingNetwork = [EmbeddingCosine1, EmbeddingCosine2, EmbeddingCosine3, EmbeddingCosine4] 339 | 340 | total_net = nn.ModuleList( 341 | [ 342 | refinedet_arm, 343 | refinedet_obm, 344 | flownetS, 345 | ] 346 | ) 347 | 348 | train_dataloader = DataLoader(train_dataset, args.batch_size, num_workers=args.num_workers, shuffle=False, collate_fn=detection_collate) 349 | 350 | if not args.resume: 351 | def xavier(param): 352 | init.xavier_uniform(param) 353 | 354 | def weights_init(m): 355 | for key in m.state_dict(): 356 | if key.split('.')[-1] == 'weight': 357 | if 'conv' in key: 358 | init.kaiming_normal(m.state_dict()[key], mode='fan_out') 359 | if 'bn' in key: 360 | m.state_dict()[key][...] = 1 361 | elif key.split('.')[-1] == 'bias': 362 | m.state_dict()[key][...] = 0 363 | 364 | refinedet_arm.apply(weights_init) 365 | refinedet_obm.apply(weights_init) 366 | flownetS.apply(weights_init) 367 | for item in EmbeddingNetwork: 368 | item.apply(weights_init) 369 | 370 | feature_weight = torch.load(args.feature_basenet) 371 | flow_weight = torch.load(args.flow_basenet)['state_dict'] 372 | 373 | refinedet_arm.vgg_list.load_state_dict(feature_weight) 374 | 375 | # select and restore parameters partly 376 | flownet_dict = {} 377 | for k, v in flow_weight.items(): 378 | if 'conv' in k: 379 | new_k = k.split('.')[0] + '.' 
+ k.split('.')[-1] 380 | if 'deconv' not in k and 'bias' in k: 381 | continue 382 | elif 'upsample' in k: 383 | new_k = k.split('_')[0] + '_' + k.split('_')[1] + '_' + k.split('_')[3] 384 | else: 385 | new_k = k 386 | flownet_dict[new_k] = v 387 | 388 | flownet_dict['q_propagate.weight'] = flownetS.state_dict()['q_propagate.weight'] 389 | flownet_stat_dict = flownetS.state_dict() 390 | flownet_stat_dict.update(flownet_dict) 391 | flownetS.load_state_dict(flownet_stat_dict) 392 | 393 | vgg_pretrained_list = [] 394 | flownet_pretrained_list = [] 395 | random_list = [] 396 | for name, param in list(total_net.named_parameters()): 397 | if 'vgg_list' in name: 398 | vgg_pretrained_list.append(param) 399 | elif 'predict_flow' in name: 400 | flownet_pretrained_list.append(param) 401 | else: 402 | random_list.append(param) 403 | 404 | else: 405 | resume_path = os.path.join(save_folder, args.resume_epoch + ".pth") 406 | state_dict = torch.load(resume_path) 407 | from collections import OrderedDict 408 | 409 | new_state_dict = OrderedDict() 410 | for k, v in state_dict.items(): 411 | if k[: 7] == "module.": 412 | name = k[7: ] 413 | else: 414 | name = k 415 | new_state_dict[name] = v 416 | total_net.load_state_dict(new_state_dict) 417 | 418 | if args.gpu_id: 419 | refinedet_arm = torch.nn.DataParallel(refinedet_arm, device_ids=args.gpu_id) 420 | refinedet_obm = torch.nn.DataParallel(refinedet_obm, device_ids=args.gpu_id) 421 | flownetS = torch.nn.DataParallel(flownetS, device_ids=args.gpu_id) 422 | 423 | if args.cuda: 424 | refinedet_arm.cuda() 425 | refinedet_obm.cuda() 426 | flownetS.cuda() 427 | cudnn.benchmark = True 428 | 429 | optimizer = optim.Adam( 430 | [ 431 | {"params": vgg_pretrained_list, "lr": args.base_lr}, 432 | {"params": flownet_pretrained_list, "lr": args.flownet_lr}, 433 | {"params": random_list} 434 | ], lr=args.lr) 435 | arm_criterion = RefineMultiBoxLoss(2, 0.5, True, 0, True, 3, 0.5, False) 436 | obm_criterion = RefineMultiBoxLoss(num_classes, 0.5, True, 0, True, 3, 0.5, False) 437 | 438 | priors = torch.Tensor(priors.astype(np.float32)).cpu() 439 | 440 | for epoch in range(args.resume_epoch, args.max_epoch): 441 | train(train_dataloader, refinedet_arm, refinedet_obm, flownetS, arm_criterion, obm_criterion, optimizer, DEVICE, epoch) 442 | torch.save(total_net.state_dict(), os.path.join(save_folder, "Epoch_" + str(epoch) + ".pth")) 443 | 444 | -------------------------------------------------------------------------------- /models/bosch-dataset-labels.txt: -------------------------------------------------------------------------------- 1 | None 2 | Green 3 | GreenStraightLeft 4 | GreenLeft 5 | RedLeft 6 | GreenStraightRight 7 | Red 8 | off 9 | GreenRight 10 | GreenStraight 11 | Yellow 12 | RedRight 13 | RedStraight 14 | RedStraightLeft -------------------------------------------------------------------------------- /network/FlowNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from torch.nn import init 5 | 6 | 7 | def conv(in_channels, out_channels, kernel_size=3, stride=1, is_training=True): 8 | if is_training: 9 | return nn.Sequential( 10 | nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=(kernel_size - 1) // 2, bias=False), 11 | nn.BatchNorm2d(out_channels), 12 | nn.LeakyReLU(0.1, inplace=True), 13 | ) 14 | else: 15 | return nn.Sequential( 16 | nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, 
padding=(kernel_size - 1) // 2, bias=True), 17 | nn.LeakyReLU(0.1, inplace=True), 18 | ) 19 | 20 | 21 | class FlowNetS(nn.Module): 22 | def __init__(self, in_channels=6, is_training=True): 23 | super(FlowNetS, self).__init__() 24 | 25 | self.is_training = is_training 26 | self.in_channels = in_channels 27 | 28 | self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=7, stride=2, padding=3, bias=(not is_training)) 29 | self.batch_norm_1 = nn.BatchNorm2d(64) 30 | self.leaky_relu_1 = nn.LeakyReLU(0.1, inplace=True) 31 | self.conv2 = nn.Conv2d(64, 128, kernel_size=5, stride=2, padding=2, bias=(not is_training)) 32 | self.batch_norm_2 = nn.BatchNorm2d(128) 33 | self.leaky_relu_2 = nn.LeakyReLU(0.1, inplace=True) 34 | self.conv3 = nn.Conv2d(128, 256, kernel_size=5, stride=2, padding=2, bias=(not is_training)) 35 | self.batch_norm_3 = nn.BatchNorm2d(256) 36 | self.leaky_relu_3 = nn.LeakyReLU(0.1, inplace=True) 37 | self.conv3_1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=(not is_training)) 38 | self.batch_norm_3_1 = nn.BatchNorm2d(256) 39 | self.leaky_relu_3_1 = nn.LeakyReLU(0.1, inplace=True) 40 | self.conv4 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=(not is_training)) 41 | self.batch_norm_4 = nn.BatchNorm2d(512) 42 | self.leaky_relu_4 = nn.LeakyReLU(0.1, inplace=True) 43 | self.conv4_1 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=(not is_training)) 44 | self.batch_norm_4_1 = nn.BatchNorm2d(512) 45 | self.leaky_relu_4_1 = nn.LeakyReLU(0.1, inplace=True) 46 | self.conv5 = nn.Conv2d(512, 512, kernel_size=3, stride=2, padding=1, bias=(not is_training)) 47 | self.batch_norm_5 = nn.BatchNorm2d(512) 48 | self.leaky_relu_5 = nn.LeakyReLU(0.1, inplace=True) 49 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=(not is_training)) 50 | self.batch_norm_5_1 = nn.BatchNorm2d(512) 51 | self.leaky_relu_5_1 = nn.LeakyReLU(0.1, inplace=True) 52 | self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, stride=2, padding=1, bias=(not is_training)) 53 | self.batch_norm_6 = nn.BatchNorm2d(1024) 54 | self.leaky_relu_6 = nn.LeakyReLU(0.1, inplace=True) 55 | self.conv6_1 = nn.Conv2d(1024, 1024, kernel_size=3, stride=1, padding=1, bias=(not is_training)) 56 | self.batch_norm_6_1 = nn.BatchNorm2d(1024) 57 | self.leaky_relu_6_1 = nn.LeakyReLU(0.1, inplace=True) 58 | 59 | self.deconv5 = nn.ConvTranspose2d(1024, 512, kernel_size=4, stride=2, padding=1, bias=True) 60 | self.deconv4 = nn.ConvTranspose2d(1026, 256, kernel_size=4, stride=2, padding=1, bias=True) 61 | self.deconv3 = nn.ConvTranspose2d(770, 128, kernel_size=4, stride=2, padding=1, bias=True) 62 | self.deconv2 = nn.ConvTranspose2d(386, 64, kernel_size=4, stride=2, padding=1, bias=True) 63 | 64 | self.predict_flow6 = nn.Conv2d(1024, 2, kernel_size=3, stride=1, padding=1, bias=True) 65 | self.predict_flow5 = nn.Conv2d(1026, 2, kernel_size=3, stride=1, padding=1, bias=True) 66 | self.predict_flow4 = nn.Conv2d(770, 2, kernel_size=3, stride=1, padding=1, bias=True) 67 | self.predict_flow3 = nn.Conv2d(386, 2, kernel_size=3, stride=1, padding=1, bias=True) 68 | self.predict_flow2 = nn.Conv2d(194, 2, kernel_size=3, stride=1, padding=1, bias=True) 69 | 70 | self.upsampled_flow6_5 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=False) 71 | self.upsampled_flow5_4 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=False) 72 | self.upsampled_flow4_3 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=False) 73 | self.upsampled_flow3_2 = 
nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=False) 74 | 75 | for m in self.modules(): 76 | if isinstance(m, nn.Conv2d): 77 | if m.bias is not None: 78 | init.uniform_(m.bias) 79 | init.xavier_uniform_(m.weight) 80 | 81 | if isinstance(m, nn.ConvTranspose2d): 82 | if m.bias is not None: 83 | init.uniform_(m.bias) 84 | init.xavier_uniform_(m.weight) 85 | 86 | self.q_propagate = nn.Conv2d(2, 1, kernel_size=3, stride=1, padding=1, bias=False) 87 | self.upsampled_q6_1 = nn.Upsample(scale_factor=64, mode='bilinear') 88 | self.upsampled_q5_1 = nn.Upsample(scale_factor=32, mode='bilinear') 89 | self.upsampled_q4_1 = nn.Upsample(scale_factor=16, mode='bilinear') 90 | self.upsampled_q3_1 = nn.Upsample(scale_factor=8, mode='bilinear') 91 | self.upsampled_q2_1 = nn.Upsample(scale_factor=4, mode='bilinear') 92 | self.upsampled_flow2_1 = nn.Upsample(scale_factor=4, mode='bilinear') 93 | 94 | def forward(self, x): 95 | #input = nn.functional.interpolate(x, size=(256, 256), mode='bilinear') 96 | 97 | out_conv1 = self.conv1(x) 98 | if self.is_training: 99 | out_conv1 = self.leaky_relu_1(self.batch_norm_1(out_conv1)) 100 | else: 101 | out_conv1 = self.leaky_relu_1(out_conv1) 102 | 103 | out_conv2 = self.conv2(out_conv1) 104 | if self.is_training: 105 | out_conv2 = self.leaky_relu_2(self.batch_norm_2(out_conv2)) 106 | else: 107 | out_conv2 = self.leaky_relu_2(out_conv2) 108 | 109 | out_conv3 = self.conv3(out_conv2) 110 | if self.is_training: 111 | out_conv3 = self.leaky_relu_3(self.batch_norm_3(out_conv3)) 112 | else: 113 | out_conv3 = self.leaky_relu_3(out_conv3) 114 | 115 | out_conv3 = self.conv3_1(out_conv3) 116 | if self.is_training: 117 | out_conv3 = self.leaky_relu_3_1(self.batch_norm_3_1(out_conv3)) 118 | else: 119 | out_conv3 = self.leaky_relu_3_1(out_conv3) 120 | 121 | out_conv4 = self.conv4(out_conv3) 122 | if self.is_training: 123 | out_conv4 = self.leaky_relu_4(self.batch_norm_4(out_conv4)) 124 | else: 125 | out_conv4 = self.leaky_relu_4(out_conv4) 126 | 127 | out_conv4 = self.conv4_1(out_conv4) 128 | if self.is_training: 129 | out_conv4 = self.leaky_relu_4_1(self.batch_norm_4_1(out_conv4)) 130 | else: 131 | out_conv4 = self.leaky_relu_4_1(out_conv4) 132 | 133 | out_conv5 = self.conv5(out_conv4) 134 | if self.is_training: 135 | out_conv5 = self.leaky_relu_5(self.batch_norm_5(out_conv5)) 136 | else: 137 | out_conv5 = self.leaky_relu_5(out_conv5) 138 | 139 | out_conv5 = self.conv5_1(out_conv5) 140 | if self.is_training: 141 | out_conv5 = self.leaky_relu_5_1(self.batch_norm_5_1(out_conv5)) 142 | else: 143 | out_conv5 = self.leaky_relu_5_1(out_conv5) 144 | 145 | out_conv6 = self.conv6(out_conv5) 146 | if self.is_training: 147 | out_conv6 = self.leaky_relu_6(self.batch_norm_6(out_conv6)) 148 | else: 149 | out_conv6 = self.leaky_relu_6(out_conv6) 150 | 151 | out_conv6 = self.conv6_1(out_conv6) 152 | if self.is_training: 153 | out_conv6 = self.leaky_relu_6_1(self.batch_norm_6_1(out_conv6)) 154 | else: 155 | out_conv6 = self.leaky_relu_6_1(out_conv6) 156 | 157 | 158 | flow6 = self.predict_flow6(out_conv6) 159 | flow6_up = self.upsampled_flow6_5(flow6) 160 | out_deconv5 = self.deconv5(out_conv6) 161 | q_propagate6 = self.upsampled_q6_1(self.q_propagate(flow6)) 162 | 163 | concat5 = torch.cat((out_conv5, out_deconv5, flow6_up), 1) 164 | flow5 = self.predict_flow5(concat5) 165 | flow5_up = self.upsampled_flow5_4(flow5) 166 | out_deconv4 = self.deconv4(concat5) 167 | q_propagate5 = self.upsampled_q5_1(self.q_propagate(flow5)) 168 | 169 | concat4 = torch.cat((out_conv4, out_deconv4, 
flow5_up), 1) 170 | flow4 = self.predict_flow4(concat4) 171 | flow4_up = self.upsampled_flow4_3(flow4) 172 | out_deconv3 = self.deconv3(concat4) 173 | q_propagate4 = self.upsampled_q4_1(self.q_propagate(flow4)) 174 | 175 | concat3 = torch.cat((out_conv3, out_deconv3, flow4_up), 1) 176 | flow3 = self.predict_flow3(concat3) 177 | flow3_up = self.upsampled_flow3_2(flow3) 178 | out_deconv2 = self.deconv2(concat3) 179 | q_propagate3 = self.upsampled_q3_1(self.q_propagate(flow3)) 180 | 181 | concat2 = torch.cat((out_conv2, out_deconv2, flow3_up), 1) 182 | flow2 = self.predict_flow2(concat2) 183 | q_propagate2 = self.upsampled_q2_1(self.q_propagate(flow2)) 184 | 185 | q_propagate = torch.cat((q_propagate2 * 4, q_propagate3 * 8, q_propagate4 * 16, q_propagate5 * 32, q_propagate6 * 64), 1) 186 | q_propagate = torch.mean(torch.abs(q_propagate), dim=3, keepdim=False) 187 | q_propagate = torch.mean(torch.abs(q_propagate), dim=2, keepdim=False) 188 | q_propagate = torch.mean(torch.abs(q_propagate), dim=1, keepdim=False) 189 | 190 | flow_result = self.upsampled_flow2_1(flow2) 191 | 192 | #flow_result = nn.functional.interpolate(flow_result, size=list(x.size())[2: ], mode='bilinear') 193 | 194 | if self.training: 195 | return flow_result, [flow2, flow3, flow4, flow5, flow6], q_propagate 196 | else: 197 | return flow_result, q_propagate 198 | -------------------------------------------------------------------------------- /network/MultiBoxLoss.py: -------------------------------------------------------------------------------- 1 | # source from: PytorchSSD-master 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from .box_utils import match, refine_match, log_sum_exp, decode 9 | GPU = False 10 | if torch.cuda.is_available(): 11 | GPU = True 12 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 13 | 14 | 15 | class RefineMultiBoxLoss(nn.Module): 16 | """SSD Weighted Loss Function 17 | Compute Targets: 18 | 1) Produce Confidence Target Indices by matching ground truth boxes 19 | with (default) 'priorboxes' that have jaccard index > threshold parameter 20 | (default threshold: 0.5). 21 | 2) Produce localization target by 'encoding' variance into offsets of ground 22 | truth boxes and their matched 'priorboxes'. 23 | 3) Hard negative mining to filter the excessive number of negative examples 24 | that comes with using a large number of default bounding boxes. 25 | (default negative:positive ratio 3:1) 26 | Objective Loss: 27 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 28 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 29 | weighted by α which is set to 1 by cross val. 30 | Args: 31 | c: class confidences, 32 | l: predicted boxes, 33 | g: ground truth boxes 34 | N: number of matched default boxes 35 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
36 | """ 37 | 38 | def __init__(self, num_classes, overlap_thresh, prior_for_matching, bkg_label, neg_mining, neg_pos, neg_overlap, encode_target, object_score = 0): 39 | super(RefineMultiBoxLoss, self).__init__() 40 | self.num_classes = num_classes 41 | self.threshold = overlap_thresh 42 | self.background_label = bkg_label 43 | self.encode_target = encode_target 44 | self.use_prior_for_matching = prior_for_matching 45 | self.do_neg_mining = neg_mining 46 | self.negpos_ratio = neg_pos 47 | self.neg_overlap = neg_overlap 48 | self.object_score = object_score 49 | self.variance = [0.1, 0.2] 50 | 51 | def forward(self, odm_data, priors, targets, arm_data=None, filter_object=False): 52 | """Multibox Loss 53 | Args: 54 | predictions (tuple): A tuple containing loc preds, conf preds, 55 | and prior boxes from SSD net. 56 | conf shape: torch.size(batch_size,num_priors,num_classes) 57 | loc shape: torch.size(batch_size,num_priors,4) 58 | priors shape: torch.size(num_priors,4) 59 | 60 | ground_truth (tensor): Ground truth boxes and labels for a batch, 61 | shape: [batch_size,num_objs,5] (last idx is the label). 62 | arm_data (tuple): arm branch containg arm_loc and arm_conf 63 | filter_object: whether filter out the prediction according to the arm conf score 64 | """ 65 | 66 | loc_data, conf_data = odm_data 67 | if arm_data: 68 | arm_loc, arm_conf = arm_data 69 | arm_loc = arm_loc.cpu() 70 | arm_conf = arm_conf.cpu() 71 | priors = priors.data 72 | num = loc_data.size(0) 73 | num_priors = (priors.size(0)) 74 | 75 | # match priors (default boxes) and ground truth boxes 76 | loc_t = torch.Tensor(num, num_priors, 4) 77 | conf_t = torch.LongTensor(num, num_priors) 78 | for idx in range(num): 79 | truths = targets[idx][:, :-1].data 80 | labels = targets[idx][:, -1].data 81 | # for object detection 82 | if self.num_classes == 2: 83 | labels = labels > 0 84 | if arm_data: 85 | refine_match(self.threshold, truths, priors, self.variance, labels, loc_t, conf_t, idx, arm_loc[idx].data) 86 | else: 87 | #loc_t[idx, :, :] = truths 88 | #conf_t[idx, :] = labels 89 | match(self.threshold, truths, priors.cpu(), self.variance, labels, loc_t, conf_t, idx) 90 | if GPU: 91 | loc_t = loc_t.cuda() 92 | conf_t = conf_t.cuda() 93 | 94 | # wrap targets 95 | loc_t = Variable(loc_t, requires_grad=False) 96 | conf_t = Variable(conf_t, requires_grad=False) 97 | if arm_data and filter_object: 98 | arm_conf_data = arm_conf.data[:, :, 1] 99 | pos = conf_t > 0 100 | object_score_index = arm_conf_data <= self.object_score 101 | pos[object_score_index] = 0 102 | else: 103 | pos = conf_t > 0 104 | 105 | # Localization Loss (Smooth L1) 106 | # Shape: [batch,num_priors,4] 107 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 108 | loc_p = loc_data[pos_idx].view(-1, 4) 109 | loc_t = loc_t[pos_idx].view(-1, 4) 110 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 111 | 112 | # Compute max conf across batch for hard negative mining 113 | batch_conf = conf_data.view(-1, self.num_classes) 114 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1)) 115 | 116 | # Hard Negative Mining 117 | loss_c = loss_c.view(num, -1) 118 | loss_c[pos] = 0 # filter out pos boxes for now 119 | _, loss_idx = loss_c.sort(1, descending=True) 120 | _, idx_rank = loss_idx.sort(1) 121 | num_pos = pos.long().sum(1, keepdim=True) 122 | num_neg = torch.clamp(self.negpos_ratio * num_pos, max=pos.size(1) - 1) 123 | neg = idx_rank < num_neg.expand_as(idx_rank) 124 | 125 | # Confidence Loss Including Positive and Negative Examples 
126 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 127 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 128 | conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view(-1, self.num_classes) 129 | targets_weighted = conf_t[(pos + neg).gt(0)] 130 | loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False) 131 | 132 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 133 | N = num_pos.data.sum().double() 134 | loss_l, loss_c = loss_l.double(), loss_c.double() 135 | loss_l /= N 136 | loss_c /= N 137 | return loss_l, loss_c 138 | -------------------------------------------------------------------------------- /network/RefineDet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .network_utils import L2Norm 5 | 6 | 7 | def vgg_layer_init(vgg_type='300', in_channels=3, batch_norm=True): 8 | vgg_cfg = { 9 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 10 | 512, 512, 512], 11 | '512': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 12 | 512, 512, 512], 13 | } 14 | 15 | vgg_list = [] 16 | for layer in vgg_cfg[vgg_type]: 17 | if layer == 'M': 18 | vgg_list += [nn.MaxPool2d(kernel_size=2, stride=2)] 19 | elif layer == 'C': 20 | vgg_list += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 21 | elif batch_norm: 22 | vgg_list += [nn.Conv2d(in_channels, layer, kernel_size=3, padding=1, stride=1)] 23 | vgg_list += [nn.BatchNorm2d(layer)] 24 | vgg_list += [nn.ReLU(inplace=True)] 25 | in_channels = layer 26 | else: 27 | vgg_list += [nn.Conv2d(in_channels, layer, kernel_size=3, padding=1, stride=1)] 28 | vgg_list += [nn.ReLU(inplace=True)] 29 | in_channels = layer 30 | 31 | vgg_list += [nn.MaxPool2d(kernel_size=2, stride=2, padding=0)] 32 | vgg_list += [nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)] 33 | vgg_list += [nn.ReLU(inplace=True)] 34 | vgg_list += [nn.Conv2d(1024, 1024, kernel_size=1)] 35 | vgg_list += [nn.ReLU(inplace=True)] 36 | 37 | return vgg_list 38 | 39 | 40 | class RefineDetArm(nn.Module): 41 | 42 | def __init__(self, vgg_type, in_channels, batch_norm=True): 43 | super(RefineDetArm, self).__init__() 44 | 45 | self.vgg_type = vgg_type 46 | self.in_channels = in_channels 47 | self.batch_norm = batch_norm 48 | 49 | self.vgg_list = nn.ModuleList( 50 | vgg_layer_init(in_channels=self.in_channels, batch_norm=False) 51 | ) 52 | 53 | self.extras = nn.Sequential( 54 | nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0), 55 | nn.ReLU(inplace=True), 56 | nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1), 57 | nn.ReLU(inplace=True), 58 | ) 59 | 60 | self.arm_sources = [] 61 | self.arm_confidences = [] 62 | self.arm_locations = [] 63 | self.arm_conf = None 64 | self.arm_loc = None 65 | 66 | self.arm_location_layers = nn.ModuleList( 67 | [ 68 | nn.Conv2d(512, 12, kernel_size=3, stride=1, padding=1), 69 | nn.Conv2d(512, 12, kernel_size=3, stride=1, padding=1), 70 | nn.Conv2d(1024, 12, kernel_size=3, stride=1, padding=1), 71 | nn.Conv2d(512, 12, kernel_size=3, stride=1, padding=1), 72 | ] 73 | ) 74 | self.arm_confidence_layers = nn.ModuleList( 75 | [ 76 | nn.Conv2d(512, 6, kernel_size=3, stride=1, padding=1), 77 | nn.Conv2d(512, 6, kernel_size=3, stride=1, padding=1), 78 | nn.Conv2d(1024, 6, kernel_size=3, stride=1, padding=1), 79 | nn.Conv2d(512, 6, kernel_size=3, stride=1, padding=1), 80 | ] 81 | ) 82 | 83 | def forward(self, x): 84 | self.arm_sources.clear() 85 | 
self.arm_locations.clear() 86 | self.arm_confidences.clear() 87 | 88 | for layer in self.vgg_list[: 23]: 89 | x = layer(x) 90 | self.arm_sources.append(x) 91 | 92 | for layer in self.vgg_list[23: 30]: 93 | x = layer(x) 94 | self.arm_sources.append(x) 95 | 96 | for layer in self.vgg_list[30:]: 97 | x = layer(x) 98 | self.arm_sources.append(x) 99 | 100 | x = self.extras(x) 101 | self.arm_sources.append(x) 102 | 103 | for arm_source, arm_confidence_layer, arm_location_layer in zip(self.arm_sources, self.arm_confidence_layers, 104 | self.arm_location_layers): 105 | self.arm_confidences.append(arm_confidence_layer(arm_source).permute(0, 2, 3, 1).contiguous()) 106 | self.arm_locations.append(arm_location_layer(arm_source).permute(0, 2, 3, 1).contiguous()) 107 | # why use transpose and reshape? 108 | arm_conf = torch.cat([item.view(item.size(0), -1) for item in self.arm_confidences], 1) 109 | arm_loc = torch.cat([item.view(item.size(0), -1) for item in self.arm_locations], 1) 110 | 111 | return x, self.arm_sources, arm_conf, arm_loc 112 | 113 | 114 | class RefineDetObm(nn.Module): 115 | 116 | def __init__(self, num_classes, batch_norm=True): 117 | super(RefineDetObm, self).__init__() 118 | 119 | self.last_layer_trans = nn.Sequential( 120 | nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1), 121 | nn.ReLU(inplace=True), 122 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 123 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 124 | ) 125 | 126 | self.L2Norm_4_3 = L2Norm(512, 10) 127 | self.L2Norm_5_3 = L2Norm(512, 8) 128 | 129 | self.obm_sources = [] 130 | self.obm_confidences = [] 131 | self.obm_locations = [] 132 | self.transfer_list = [] 133 | 134 | self.arm_sources = None 135 | self.arm_conf = None 136 | self.arm_loc = None 137 | self.obm_conf = None 138 | self.obm_loc = None 139 | self.trans_result = None 140 | 141 | self.num_classes = num_classes 142 | 143 | self.obm_location_layers = nn.ModuleList( 144 | [ 145 | nn.Conv2d(256, 12, kernel_size=3, stride=1, padding=1), 146 | nn.Conv2d(256, 12, kernel_size=3, stride=1, padding=1), 147 | nn.Conv2d(256, 12, kernel_size=3, stride=1, padding=1), 148 | nn.Conv2d(256, 12, kernel_size=3, stride=1, padding=1), 149 | ] 150 | ) 151 | self.obm_confidence_layers = nn.ModuleList( 152 | [ 153 | nn.Conv2d(256, 3 * self.num_classes, kernel_size=3, stride=1, padding=1), 154 | nn.Conv2d(256, 3 * self.num_classes, kernel_size=3, stride=1, padding=1), 155 | nn.Conv2d(256, 3 * self.num_classes, kernel_size=3, stride=1, padding=1), 156 | nn.Conv2d(256, 3 * self.num_classes, kernel_size=3, stride=1, padding=1), 157 | ] 158 | ) 159 | self.transfer_layers = nn.ModuleList( 160 | [ 161 | nn.Sequential( 162 | nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1), 163 | nn.ReLU(inplace=True), 164 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 165 | ), 166 | nn.Sequential( 167 | nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1), 168 | nn.ReLU(inplace=True), 169 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 170 | ), 171 | nn.Sequential( 172 | nn.Conv2d(1024, 256, kernel_size=3, stride=1, padding=1), 173 | nn.ReLU(inplace=True), 174 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 175 | ), 176 | ] 177 | ) 178 | self.upconv_layers = nn.ModuleList( 179 | [ 180 | nn.ConvTranspose2d(256, 256, kernel_size=2, stride=2, padding=0), 181 | nn.ConvTranspose2d(256, 256, kernel_size=2, stride=2, padding=0), 182 | nn.ConvTranspose2d(256, 256, kernel_size=2, stride=2, padding=0), 183 | ] 184 | ) 185 | self.latent_layers = 
nn.ModuleList( 186 | [ 187 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 188 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 189 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 190 | ] 191 | ) 192 | self.softmax = nn.Softmax() 193 | 194 | # if "is_propagate" is set to True, then "previous_layer" is needed. 195 | # currently simply use previous feature maps 196 | def forward(self, x, arm_sources, arm_conf, arm_loc, is_training=True): 197 | 198 | self.obm_sources.clear() 199 | self.obm_confidences.clear() 200 | self.obm_locations.clear() 201 | self.transfer_list.clear() 202 | 203 | self.arm_sources = arm_sources 204 | self.arm_conf = arm_conf 205 | self.arm_loc = arm_loc 206 | 207 | x = self.last_layer_trans(x) 208 | self.obm_sources.append(x) 209 | 210 | for arm_source, transfer_layer in zip(self.arm_sources, self.transfer_layers): 211 | self.transfer_list.append(transfer_layer(arm_source)) 212 | 213 | self.transfer_list.reverse() 214 | self.arm_sources.reverse() 215 | 216 | for transfer_item, upconv_item, latent_item in zip(self.transfer_list, self.upconv_layers, self.latent_layers): 217 | x = F.relu(latent_item(F.relu(upconv_item(x) + transfer_item, inplace=True)), inplace=True) 218 | self.obm_sources.append(x) 219 | 220 | for obm_source, obm_confidence_layer, obm_location_layer in zip(self.obm_sources, self.obm_confidence_layers, self.obm_location_layers): 221 | self.obm_confidences.append(obm_confidence_layer(obm_source).permute(0, 2, 3, 1).contiguous()) 222 | self.obm_locations.append(obm_location_layer(obm_source).permute(0, 2, 3, 1).contiguous()) 223 | self.obm_conf = torch.cat([item.view(item.size(0), -1) for item in self.obm_confidences], 1) 224 | self.obm_loc = torch.cat([item.view(item.size(0), -1) for item in self.obm_locations], 1) 225 | 226 | output = ( 227 | self.arm_sources[-1], 228 | self.softmax(self.arm_conf.view(self.arm_conf.size(0), -1, 2)), 229 | self.arm_loc.view(self.arm_loc.size(0), -1, 4), 230 | self.softmax(self.obm_conf.view(self.obm_conf.size(0), -1, self.num_classes)), 231 | self.obm_loc.view(self.obm_loc.size(0), -1, 4), 232 | #None, # updating mask 233 | ) 234 | 235 | return output 236 | 237 | -------------------------------------------------------------------------------- /network/__pycache__/FlowNet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/network/__pycache__/FlowNet.cpython-36.pyc -------------------------------------------------------------------------------- /network/__pycache__/MultiBoxLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/network/__pycache__/MultiBoxLoss.cpython-36.pyc -------------------------------------------------------------------------------- /network/__pycache__/RefineDet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/network/__pycache__/RefineDet.cpython-36.pyc -------------------------------------------------------------------------------- /network/__pycache__/box_utils.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/network/__pycache__/box_utils.cpython-36.pyc -------------------------------------------------------------------------------- /network/__pycache__/network_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/network/__pycache__/network_utils.cpython-36.pyc -------------------------------------------------------------------------------- /network/__pycache__/prior_box.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/network/__pycache__/prior_box.cpython-36.pyc -------------------------------------------------------------------------------- /network/box_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | import numpy as np 5 | if torch.cuda.is_available(): 6 | import torch.backends.cudnn as cudnn 7 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 8 | 9 | 10 | def point_form(boxes): 11 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 12 | representation for comparison to point form ground truth data. 13 | Args: 14 | boxes: (tensor) center-size default boxes from priorbox layers. 15 | Return: 16 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 17 | """ 18 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 19 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 20 | 21 | 22 | def center_size(boxes): 23 | """ Convert prior_boxes to (cx, cy, w, h) 24 | representation for comparison to center-size form ground truth data. 25 | Args: 26 | boxes: (tensor) point_form boxes 27 | Return: 28 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 29 | """ 30 | return torch.cat([(boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 31 | boxes[:, 2:] - boxes[:, :2]], 1) # w, h 32 | 33 | 34 | def intersect(box_a, box_b): 35 | """ We resize both tensors to [A,B,2] without new malloc: 36 | [A,2] -> [A,1,2] -> [A,B,2] 37 | [B,2] -> [1,B,2] -> [A,B,2] 38 | Then we compute the area of intersect between box_a and box_b. 39 | Args: 40 | box_a: (tensor) bounding boxes, Shape: [A,4]. 41 | box_b: (tensor) bounding boxes, Shape: [B,4]. 42 | Return: 43 | (tensor) intersection area, Shape: [A,B]. 44 | """ 45 | A = box_a.size(0) 46 | B = box_b.size(0) 47 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), 48 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) 49 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), 50 | box_b[:, :2].unsqueeze(0).expand(A, B, 2)) 51 | inter = torch.clamp((max_xy - min_xy), min=0) 52 | return inter[:, :, 0] * inter[:, :, 1] 53 | 54 | 55 | def jaccard(box_a, box_b): 56 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 57 | is simply the intersection over union of two boxes. Here we operate on 58 | ground truth boxes and default boxes. 
59 | E.g.: 60 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 61 | Args: 62 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] 63 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] 64 | Return: 65 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] 66 | """ 67 | inter = intersect(box_a, box_b) 68 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 69 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] 70 | area_b = ((box_b[:, 2]-box_b[:, 0]) * 71 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] 72 | union = area_a + area_b - inter 73 | return inter / union # [A,B] 74 | 75 | def matrix_iou(a,b): 76 | """ 77 | return iou of a and b, numpy version for data augenmentation 78 | """ 79 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) 80 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) 81 | 82 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) 83 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) 84 | area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) 85 | return area_i / (area_a[:, np.newaxis] + area_b - area_i) 86 | 87 | 88 | def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx): 89 | """Match each prior box with the ground truth box of the highest jaccard 90 | overlap, encode the bounding boxes, then return the matched indices 91 | corresponding to both confidence and location preds. 92 | Args: 93 | threshold: (float) The overlap threshold used when mathing boxes. 94 | truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors]. 95 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 96 | variances: (tensor) Variances corresponding to each prior coord, 97 | Shape: [num_priors, 4]. 98 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 99 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 100 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 101 | idx: (int) current batch index 102 | Return: 103 | The matched indices corresponding to 1)location and 2)confidence preds. 
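        Example (shapes only, for illustration): with 2 ground-truth boxes and
        4 priors, `overlaps` is a [2, 4] jaccard matrix; `best_prior_idx` ([2])
        gives each ground truth its best prior, `best_truth_idx` ([4]) gives
        each prior its best ground truth, and any prior whose best overlap falls
        below `threshold` is labelled 0 (background) in conf_t.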
104 | """ 105 | # jaccard index 106 | overlaps = jaccard( 107 | truths, 108 | point_form(priors) 109 | ) 110 | # (Bipartite Matching) 111 | # [1,num_objects] best prior for each ground truth 112 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 113 | # [1,num_priors] best ground truth for each prior 114 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 115 | best_truth_idx.squeeze_(0) 116 | best_truth_overlap.squeeze_(0) 117 | best_prior_idx.squeeze_(1) 118 | best_prior_overlap.squeeze_(1) 119 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior 120 | # TODO refactor: index best_prior_idx with long tensor 121 | # ensure every gt matches with its prior of max overlap 122 | for j in range(best_prior_idx.size(0)): 123 | best_truth_idx[best_prior_idx[j]] = j 124 | matches = truths[best_truth_idx] # Shape: [num_priors,4] 125 | conf = labels[best_truth_idx] # Shape: [num_priors] 126 | conf[best_truth_overlap < threshold] = 0 # label as background 127 | loc = encode(matches, priors, variances) 128 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 129 | conf_t[idx] = conf # [num_priors] top class label for each prior 130 | 131 | def refine_match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx, arm_loc): 132 | """Match each arm bbox with the ground truth box of the highest jaccard 133 | overlap, encode the bounding boxes, then return the matched indices 134 | corresponding to both confidence and location preds. 135 | Args: 136 | threshold: (float) The overlap threshold used when mathing boxes. 137 | truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors]. 138 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 139 | variances: (tensor) Variances corresponding to each prior coord, 140 | Shape: [num_priors, 4]. 141 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 142 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 143 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 144 | idx: (int) current batch index 145 | arm_loc: (tensor) arm loc data,shape: [n_priors,4] 146 | Return: 147 | The matched indices corresponding to 1)location and 2)confidence preds. 
148 | """ 149 | # decode arm box 150 | decode_arm = decode(arm_loc, priors=priors, variances=variances) 151 | # jaccard index 152 | overlaps = jaccard( 153 | truths, 154 | decode_arm 155 | ) 156 | # (Bipartite Matching) 157 | # [1,num_objects] best prior for each ground truth 158 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 159 | # [1,num_priors] best ground truth for each prior 160 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 161 | best_truth_idx.squeeze_(0) 162 | best_truth_overlap.squeeze_(0) 163 | best_prior_idx.squeeze_(1) 164 | best_prior_overlap.squeeze_(1) 165 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior 166 | # TODO refactor: index best_prior_idx with long tensor 167 | # ensure every gt matches with its prior of max overlap 168 | for j in range(best_prior_idx.size(0)): 169 | best_truth_idx[best_prior_idx[j]] = j 170 | matches = truths[best_truth_idx] # Shape: [num_priors,4] 171 | conf = labels[best_truth_idx] # Shape: [num_priors] 172 | conf[best_truth_overlap < threshold] = 0 # label as background 173 | loc = encode(matches, center_size(decode_arm), variances) 174 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 175 | conf_t[idx] = conf # [num_priors] top class label for each prior 176 | 177 | def encode(matched, priors, variances): 178 | """Encode the variances from the priorbox layers into the ground truth boxes 179 | we have matched (based on jaccard overlap) with the prior boxes. 180 | Args: 181 | matched: (tensor) Coords of ground truth for each prior in point-form 182 | Shape: [num_priors, 4]. 183 | priors: (tensor) Prior boxes in center-offset form 184 | Shape: [num_priors,4]. 185 | variances: (list[float]) Variances of priorboxes 186 | Return: 187 | encoded boxes (tensor), Shape: [num_priors, 4] 188 | """ 189 | 190 | # dist b/t match center and prior's center 191 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] 192 | # encode variance 193 | g_cxcy /= (variances[0] * priors[:, 2:]) 194 | # match wh / prior wh 195 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 196 | g_wh = torch.log(g_wh) / variances[1] 197 | # return target for smooth_l1_loss 198 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 199 | 200 | 201 | def encode_multi(matched, priors, offsets, variances): 202 | """Encode the variances from the priorbox layers into the ground truth boxes 203 | we have matched (based on jaccard overlap) with the prior boxes. 204 | Args: 205 | matched: (tensor) Coords of ground truth for each prior in point-form 206 | Shape: [num_priors, 4]. 207 | priors: (tensor) Prior boxes in center-offset form 208 | Shape: [num_priors,4]. 
209 | variances: (list[float]) Variances of priorboxes 210 | Return: 211 | encoded boxes (tensor), Shape: [num_priors, 4] 212 | """ 213 | 214 | # dist b/t match center and prior's center 215 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] - offsets[:,:2] 216 | # encode variance 217 | #g_cxcy /= (variances[0] * priors[:, 2:]) 218 | g_cxcy.div_(variances[0] * offsets[:, 2:]) 219 | # match wh / prior wh 220 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 221 | g_wh = torch.log(g_wh) / variances[1] 222 | # return target for smooth_l1_loss 223 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 224 | 225 | # Adapted from https://github.com/Hakuyume/chainer-ssd 226 | def decode(loc, priors, variances): 227 | """Decode locations from predictions using priors to undo 228 | the encoding we did for offset regression at train time. 229 | Args: 230 | loc (tensor): location predictions for loc layers, 231 | Shape: [num_priors,4] 232 | priors (tensor): Prior boxes in center-offset form. 233 | Shape: [num_priors,4]. 234 | variances: (list[float]) Variances of priorboxes 235 | Return: 236 | decoded bounding box predictions 237 | """ 238 | 239 | boxes = torch.cat(( 240 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 241 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 242 | boxes[:, :2] -= boxes[:, 2:] / 2 243 | boxes[:, 2:] += boxes[:, :2] 244 | return boxes 245 | 246 | def decode_multi(loc, priors, offsets, variances): 247 | """Decode locations from predictions using priors to undo 248 | the encoding we did for offset regression at train time. 249 | Args: 250 | loc (tensor): location predictions for loc layers, 251 | Shape: [num_priors,4] 252 | priors (tensor): Prior boxes in center-offset form. 253 | Shape: [num_priors,4]. 254 | variances: (list[float]) Variances of priorboxes 255 | Return: 256 | decoded bounding box predictions 257 | """ 258 | 259 | boxes = torch.cat(( 260 | priors[:, :2] + offsets[:,:2]+ loc[:, :2] * variances[0] * offsets[:, 2:], 261 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 262 | boxes[:, :2] -= boxes[:, 2:] / 2 263 | boxes[:, 2:] += boxes[:, :2] 264 | return boxes 265 | 266 | def log_sum_exp(x): 267 | """Utility function for computing log_sum_exp while determining 268 | This will be used to determine unaveraged confidence loss across 269 | all examples in a batch. 270 | Args: 271 | x (Variable(tensor)): conf_preds from conf layers 272 | """ 273 | x_max = x.data.max() 274 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 275 | 276 | 277 | # Original author: Francisco Massa: 278 | # https://github.com/fmassa/object-detection.torch 279 | # Ported to PyTorch by Max deGroot (02/01/2017) 280 | def nms(boxes, scores, overlap=0.5, top_k=200): 281 | """Apply non-maximum suppression at test time to avoid detecting too many 282 | overlapping bounding boxes for a given object. 283 | Args: 284 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 285 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 286 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 287 | top_k: (int) The Maximum number of box preds to consider. 288 | Return: 289 | The indices of the kept boxes with respect to num_priors. 
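        Example (sketch; `decoded_boxes` and `class_scores` are placeholder
        names): for one image and one class,
            ids, count = nms(decoded_boxes, class_scores, overlap=0.45, top_k=200)
            kept_boxes = decoded_boxes[ids[:count]]
        Only the first `count` entries of the returned index tensor are valid;
        the remaining slots keep their initial value of 0.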
290 | """ 291 | 292 | keep = torch.Tensor(scores.size(0)).fill_(0).long() 293 | if boxes.numel() == 0: 294 | return keep 295 | x1 = boxes[:, 0] 296 | y1 = boxes[:, 1] 297 | x2 = boxes[:, 2] 298 | y2 = boxes[:, 3] 299 | area = torch.mul(x2 - x1, y2 - y1) 300 | v, idx = scores.sort(0) # sort in ascending order 301 | # I = I[v >= 0.01] 302 | idx = idx[-top_k:] # indices of the top-k largest vals 303 | xx1 = boxes.new() 304 | yy1 = boxes.new() 305 | xx2 = boxes.new() 306 | yy2 = boxes.new() 307 | w = boxes.new() 308 | h = boxes.new() 309 | 310 | # keep = torch.Tensor() 311 | count = 0 312 | while idx.numel() > 0: 313 | i = idx[-1] # index of current largest val 314 | # keep.append(i) 315 | keep[count] = i 316 | count += 1 317 | if idx.size(0) == 1: 318 | break 319 | idx = idx[:-1] # remove kept element from view 320 | # load bboxes of next highest vals 321 | torch.index_select(x1, 0, idx, out=xx1) 322 | torch.index_select(y1, 0, idx, out=yy1) 323 | torch.index_select(x2, 0, idx, out=xx2) 324 | torch.index_select(y2, 0, idx, out=yy2) 325 | # store element-wise max with next highest score 326 | xx1 = torch.clamp(xx1, min=x1[i]) 327 | yy1 = torch.clamp(yy1, min=y1[i]) 328 | xx2 = torch.clamp(xx2, max=x2[i]) 329 | yy2 = torch.clamp(yy2, max=y2[i]) 330 | w.resize_as_(xx2) 331 | h.resize_as_(yy2) 332 | w = xx2 - xx1 333 | h = yy2 - yy1 334 | # check sizes of xx1 and xx2.. after each iteration 335 | w = torch.clamp(w, min=0.0) 336 | h = torch.clamp(h, min=0.0) 337 | inter = w*h 338 | # IoU = i / (area(a) + area(b) - i) 339 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 340 | union = (rem_areas - inter) + area[i] 341 | IoU = inter/union # store result in iou 342 | # keep only elements with an IoU <= overlap 343 | idx = idx[IoU.le(overlap)] 344 | return keep, count 345 | -------------------------------------------------------------------------------- /network/network_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Function 4 | from torch.autograd import Variable 5 | import torch.nn.init as init 6 | 7 | class L2Norm(nn.Module): 8 | def __init__(self, n_channels, gamma=None): 9 | super(L2Norm, self).__init__() 10 | self.n_channels = n_channels 11 | self.gamma = gamma 12 | self.eps = 1e-10 13 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 14 | self.reset_parameters() 15 | 16 | def reset_parameters(self): 17 | init.constant(self.weight, self.gamma) 18 | 19 | def forward(self, x): 20 | norm = x.pow(x).sum(dim=1, keepdim=True).sqrt() + self.eps 21 | x /= norm 22 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 23 | return out 24 | 25 | class EmbeddingNetwork(nn.Module): 26 | 27 | def __init__(self, in_channels): 28 | super(EmbeddingNetwork, self).__init__() 29 | self.conv1 = nn.Conv2d(in_channels, 512, kernel_size=1, stride=1, padding=0) 30 | self.conv2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1) 31 | self.conv3 = nn.Conv2d(512, 2048, kernel_size=1, stride=1, padding=0) 32 | self.global_avg_pool = nn.AdaptiveAvgPool2d(1) 33 | 34 | def forward(self, x): 35 | x = self.conv1(x) 36 | x = self.conv2(x) 37 | x = self.conv3(x) 38 | x = self.global_avg_pool(x) 39 | x = x.squeeze(-1).squeeze(-1) 40 | 41 | return x -------------------------------------------------------------------------------- /network/prior_box.py: -------------------------------------------------------------------------------- 1 | from itertools import 
product as product 2 | from math import sqrt as sqrt 3 | 4 | import torch 5 | import numpy as np 6 | 7 | if torch.cuda.is_available(): 8 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 9 | 10 | 11 | class PriorBox(object): 12 | """Compute priorbox coordinates in center-offset form for each source 13 | feature map. 14 | Note: 15 | This 'layer' has changed between versions of the original SSD 16 | paper, so we include both versions, but note v2 is the most tested and most 17 | recent version of the paper. 18 | 19 | """ 20 | 21 | def __init__(self, cfg): 22 | super(PriorBox, self).__init__() 23 | self.image_size = cfg['min_dim'] 24 | # number of priors for feature map location (either 4 or 6) 25 | self.num_priors = len(cfg['aspect_ratios']) 26 | self.variance = cfg['variance'] or [0.1] 27 | self.feature_maps = cfg['feature_maps'] 28 | self.min_sizes = cfg['min_sizes'] 29 | self.max_sizes = cfg['max_sizes'] 30 | self.steps = cfg['steps'] 31 | self.aspect_ratios = cfg['aspect_ratios'] 32 | self.clip = cfg['clip'] 33 | for v in self.variance: 34 | if v <= 0: 35 | raise ValueError('Variances must be greater than 0') 36 | 37 | def forward(self): 38 | mean = [] 39 | for k, f in enumerate(self.feature_maps): 40 | for i, j in product(range(f), repeat=2): 41 | f_k = self.image_size / self.steps[k] 42 | cx = (j + 0.5) / f_k 43 | cy = (i + 0.5) / f_k 44 | 45 | s_k = self.min_sizes[k] / self.image_size 46 | mean += [cx, cy, s_k, s_k] 47 | 48 | # aspect_ratio: 1 49 | # rel size: sqrt(s_k * s_(k+1)) 50 | if self.max_sizes: 51 | s_k_prime = sqrt(s_k * (self.max_sizes[k] / self.image_size)) 52 | mean += [cx, cy, s_k_prime, s_k_prime] 53 | 54 | # rest of aspect ratios 55 | for ar in self.aspect_ratios[k]: 56 | mean += [cx, cy, s_k * sqrt(ar), s_k / sqrt(ar)] 57 | mean += [cx, cy, s_k / sqrt(ar), s_k * sqrt(ar)] 58 | 59 | # back to torch land 60 | output = np.array(mean).reshape(-1, 4) 61 | #output = torch.Tensor(mean).view(-1, 4) 62 | if self.clip: 63 | output = np.clip(output, 0., 1.) 64 | #output.clamp_(max=1, min=0) 65 | return output 66 | --------------------------------------------------------------------------------
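A minimal sketch of how PriorBox might be configured and called for a 320x320 RefineDet-style model with four source layers; the values below are illustrative assumptions, not settings taken from this repository:

    # Hypothetical configuration -- the real values used by this project may differ.
    cfg = {
        'min_dim': 320,                         # input image size
        'feature_maps': [40, 20, 10, 5],        # spatial size of each source layer
        'steps': [8, 16, 32, 64],               # min_dim / feature map size
        'min_sizes': [32, 64, 128, 256],        # anchor scale per layer
        'max_sizes': [],                        # empty list -> skip the sqrt(s_k * s_k+1) prior
        'aspect_ratios': [[2], [2], [2], [2]],  # plus the implicit ratio 1, i.e. 3 priors per cell
        'variance': [0.1, 0.2],
        'clip': True,
    }
    priors = PriorBox(cfg).forward()            # numpy array, shape [num_priors, 4], (cx, cy, w, h)

Three priors per location is consistent with the 12-channel (3 * 4) location heads and the 3 * num_classes confidence heads defined in RefineDet.py.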