├── .gitignore ├── README.md ├── dataset ├── __pycache__ │ ├── bosch.cpython-36.pyc │ ├── data_preprocessing.cpython-36.pyc │ └── transforms.cpython-36.pyc ├── bosch.py ├── data_preprocessing.py └── transforms.py ├── main.py ├── models └── bosch-dataset-labels.txt └── network ├── FlowNet.py ├── MultiBoxLoss.py ├── RefineDet.py ├── __pycache__ ├── FlowNet.cpython-36.pyc ├── MultiBoxLoss.cpython-36.pyc ├── RefineDet.cpython-36.pyc ├── box_utils.cpython-36.pyc ├── network_utils.cpython-36.pyc └── prior_box.cpython-36.pyc ├── box_utils.py ├── network_utils.py └── prior_box.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Video-Object-Detection (Still in progress) 2 | ## Description 3 | Based on the paper: "Towards High Performance Video Object Detection" use Pytorch 0.4.1 and Python 3.6 4 | 5 | The model is currently running on Bosch Traffic Light Dataset only, but it will be easy to add another dataset by modifying dataloader. 6 | 7 | For training simply use 'python main.py' and set args according to your need. 8 | 9 | ## Reference Links 10 | The RefineDet's code is inspired by https://github.com/lzx1413/PytorchSSD. 11 | 12 | The vgg pretrained model is downloaded from https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth. 13 | 14 | The FlowNet's code and pretrained model are inspired and downloaded from https://github.com/NVIDIA/flownet2-pytorch. 15 | 16 | Please download the above 2 pretrained models and place them into 'models' directory. 
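
## Dataloader example
The Bosch dataloader is wired up in `main.py` roughly as sketched below; the dataset path and hyper-parameter values here are only illustrative, so adjust them to your setup (or simply pass the corresponding command-line arguments to `main.py`).

```python
import numpy as np
from torch.utils.data import DataLoader

from dataset.bosch import BoschTrainDetection, detection_collate
from dataset.data_preprocessing import TrainAugmentation

# 320x320 input with per-channel pixel means and std, as in main.py.
train_transform = TrainAugmentation(320, np.array([123, 117, 104]), 128.)

# root_dir is illustrative; point it at your local copy of the Bosch dataset.
train_dataset = BoschTrainDetection(root_dir="/path/to/Bosch_Dataset",
                                    yaml_file="train.yaml",
                                    transform=train_transform,
                                    target_transform=False)

# detection_collate stacks the images and keeps per-image annotation tensors in a list.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False,
                              num_workers=0, collate_fn=detection_collate)
```

Adding a new dataset mainly means providing a `Dataset` class with the same `(image, annotations)` output format and plugging it in here.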
17 | 
18 | ## Others
19 | Some notes on my understanding of the paper's design (some of these may be wrong; I will revisit them later):
20 | * During training, two frames are randomly selected, with the earlier one used as the key frame and the later one as the non-key frame.
21 | * The "q_propagate" factor is normalized to the range 0-1 during training so that it stays differentiable and gradients can be propagated backward. At inference time it should be either 0 or 1.
22 | * The is_aggr and is_partial flags are both set to True for all frame pairs during training, since each batch contains only a few key frames. At inference time they should be treated differently.
23 | 
24 | Some modifications of my own:
25 | * I use RefineDet instead of a single ResNet as the base detection network. Therefore, the FlowNet results are also used in some intermediate source layers in addition to the final layer.
26 | 
27 | TODO:
28 | * add multi-GPU support. (I am still a beginner with PyTorch.)
29 | * optimize which tensors are placed on the CPU versus the GPU.
30 | * add the inference part.
31 | * ...
32 | 
--------------------------------------------------------------------------------
/dataset/__pycache__/bosch.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/dataset/__pycache__/bosch.cpython-36.pyc
--------------------------------------------------------------------------------
/dataset/__pycache__/data_preprocessing.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/dataset/__pycache__/data_preprocessing.cpython-36.pyc
--------------------------------------------------------------------------------
/dataset/__pycache__/transforms.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/dataset/__pycache__/transforms.cpython-36.pyc
--------------------------------------------------------------------------------
/dataset/bosch.py:
--------------------------------------------------------------------------------
1 | import torch, torchvision
2 | import numpy as np
3 | from torch.utils.data import Dataset
4 | import yaml, os, sys, time, io
5 | from PIL import Image
6 | 
7 | # BOSCH_ROOT = os.path.join(HOME, 'Bosch_Dataset/')
8 | INDEX2LABEL = {0: 'None', 1: 'Green', 2: 'GreenStraightLeft', 3: 'GreenLeft', 4: 'RedLeft', 5: 'GreenStraightRight',
9 |                6: 'Red', 7: 'off', \
10 |                8: 'GreenRight', 9: 'GreenStraight', 10: 'Yellow', 11: 'RedRight', 12: 'RedStraight',
11 |                13: 'RedStraightLeft'}
12 | BOSCH_CLASSES = ['None', 'Green', 'GreenStraightLeft', 'GreenLeft', 'RedLeft', 'GreenStraightRight', 'Red', 'off', \
13 |                  'GreenRight', 'GreenStraight', 'Yellow', 'RedRight', 'RedStraight', 'RedStraightLeft']
14 | 
15 | 
16 | def detection_collate(batch):
17 |     """Custom collate fn for dealing with batches of images that have a different
18 |     number of associated object annotations (bounding boxes).
19 | 20 | Arguments: 21 | batch: (tuple) A tuple of tensor images and lists of annotations 22 | 23 | Return: 24 | A tuple containing: 25 | 1) (tensor) batch of images stacked on their 0 dim 26 | 2) (list of tensors) annotations for a given image are stacked on 0 dim 27 | """ 28 | targets = [] 29 | imgs = [] 30 | for _, sample in enumerate(batch): 31 | for _, tup in enumerate(sample): 32 | if torch.is_tensor(tup): 33 | imgs.append(tup) 34 | elif isinstance(tup, type(np.empty(0))): 35 | annos = torch.from_numpy(tup).float() 36 | targets.append(annos) 37 | 38 | return (torch.stack(imgs, 0), targets) 39 | 40 | 41 | class BoschTrainDetection(Dataset): 42 | decode_yaml_file = None 43 | color2index = {} 44 | index2color = {} 45 | class_names = BOSCH_CLASSES 46 | 47 | def __init__(self, root_dir, yaml_file, transform=None, target_transform=None): 48 | self.root_dir = root_dir 49 | self.yaml_file = yaml_file 50 | self.transform = transform 51 | self.target_transform = target_transform 52 | 53 | if not os.path.exists(os.path.join(self.root_dir, self.yaml_file)): 54 | print(os.path.join(self.root_dir, self.yaml_file)) 55 | print("input path parameters not valid.") 56 | return None 57 | 58 | self.color2index = {} 59 | self.index2color = {} 60 | color_array = [] 61 | 62 | # return as a dict 63 | self.decode_yaml_file = yaml.load(open(os.path.join(self.root_dir, self.yaml_file))) 64 | self.decode_yaml_file = list(filter(lambda x: len(x['boxes']) != 0, self.decode_yaml_file)) 65 | for item in self.decode_yaml_file: 66 | for box in item['boxes']: 67 | if box['label'] not in color_array: 68 | color_array.append(box['label']) 69 | 70 | for index in range(len(color_array)): 71 | self.color2index[color_array[index]] = index + 1 72 | self.index2color[index + 1] = color_array[index] 73 | self.color2index['bg'] = 0 74 | self.index2color[0] = 'bg' 75 | 76 | self.threshold = 0.5 77 | self.variance = [0.1, 0.2] 78 | 79 | self.decode_dataset = [] 80 | 81 | count = 0 82 | 83 | for element in self.decode_yaml_file: 84 | if len(element['boxes']) == 0: 85 | continue 86 | dataset_element = {} 87 | dataset_element['path'] = element['path'] 88 | dataset_element['annotation'] = [] 89 | for box_info in element['boxes']: 90 | dataset_element['annotation'].append(np.array( 91 | [box_info['x_min'], box_info['y_min'], box_info['x_max'], box_info['y_max'], 92 | self.color2index[box_info['label']]]).astype(np.float32)) 93 | dataset_element['annotation'] = np.array(dataset_element['annotation']).astype(np.float32) 94 | # dataset_element['annotation'] = np.transpose(np.array(dataset_element['annotation']), (1, 0)) 95 | self.decode_dataset.append(dataset_element) 96 | 97 | count += 1 98 | if count == 100: 99 | break 100 | 101 | image_name = os.path.join(self.root_dir, self.decode_dataset[0]['path']) 102 | #print("image shape:", np.array(Image.open(image_name)).shape) 103 | 104 | del self.decode_yaml_file 105 | 106 | def __len__(self): 107 | return len(self.decode_dataset) 108 | 109 | def __getitem__(self, index): 110 | image_name = os.path.join(self.root_dir, self.decode_dataset[index]['path']) 111 | image = Image.open(image_name) 112 | image = np.array(image) 113 | boxes = self.decode_dataset[index]['annotation'][:, : 4] 114 | labels = self.decode_dataset[index]['annotation'][:, 4] 115 | if self.transform: 116 | image, boxes, labels = self.transform(image, boxes, labels) 117 | if self.target_transform: 118 | """ 119 | print(self.priors) 120 | loc_t = torch.Tensor(1, self.priors.shape[0], 4) 121 | conf_t = torch.LongTensor(1, 
self.priors.shape[0]) 122 | match(self.threshold, torch.Tensor(boxes), torch.Tensor(self.priors).float(), self.variance, torch.Tensor(labels), loc_t, conf_t, 0) 123 | boxes, labels = loc_t[0], conf_t[0] 124 | """ 125 | boxes, labels = self.target_transform(boxes, labels) 126 | return image, np.concatenate((boxes, np.expand_dims(labels, -1)), 1) 127 | 128 | 129 | class BoschTestDetection(Dataset): 130 | 131 | def __init__(self): 132 | pass 133 | 134 | def __len__(self): 135 | pass 136 | 137 | def __getitem__(self, index): 138 | pass 139 | em__(self, index): 140 | pass 141 | -------------------------------------------------------------------------------- /dataset/data_preprocessing.py: -------------------------------------------------------------------------------- 1 | from .transforms import * 2 | 3 | 4 | """ 5 | ConvertFromInts(), 6 | PhotometricDistort(), 7 | Expand(self.mean), 8 | RandomSampleCrop(), 9 | RandomMirror(), 10 | """ 11 | 12 | 13 | class TrainAugmentation: 14 | def __init__(self, size, mean=0, std=1.0): 15 | """ 16 | Args: 17 | size: the size the of final image. 18 | mean: mean pixel value per channel. 19 | """ 20 | self.mean = mean 21 | self.size = size 22 | self.augment = Compose([ 23 | ToPercentCoords(), 24 | Resize(self.size), 25 | SubtractMeans(self.mean), 26 | lambda img, boxes=None, labels=None: (img / 255., boxes, labels), 27 | ToTensor(), 28 | ]) 29 | 30 | def __call__(self, img, boxes, labels): 31 | """ 32 | 33 | Args: 34 | img: the output of cv.imread in RGB layout. 35 | boxes: boundding boxes in the form of (x1, y1, x2, y2). 36 | labels: labels of boxes. 37 | """ 38 | return self.augment(img, boxes, labels) 39 | 40 | 41 | class TestTransform: 42 | def __init__(self, size, mean=0.0, std=1.0): 43 | self.transform = Compose([ 44 | ToPercentCoords(), 45 | Resize(size), 46 | SubtractMeans(mean), 47 | lambda img, boxes=None, labels=None: (img / std, boxes, labels), 48 | ToTensor(), 49 | ]) 50 | 51 | def __call__(self, image, boxes, labels): 52 | return self.transform(image, boxes, labels) 53 | 54 | 55 | class PredictionTransform: 56 | def __init__(self, size, mean=0.0, std=1.0): 57 | self.transform = Compose([ 58 | Resize(size), 59 | SubtractMeans(mean), 60 | lambda img, boxes=None, labels=None: (img / std, boxes, labels), 61 | ToTensor() 62 | ]) 63 | 64 | def __call__(self, image): 65 | image, _, _ = self.transform(image) 66 | return image -------------------------------------------------------------------------------- /dataset/transforms.py: -------------------------------------------------------------------------------- 1 | # from https://github.com/amdegroot/ssd.pytorch 2 | 3 | 4 | import torch 5 | from torchvision import transforms 6 | import cv2 7 | import numpy as np 8 | import types 9 | from numpy import random 10 | 11 | 12 | def intersect(box_a, box_b): 13 | max_xy = np.minimum(box_a[:, 2:], box_b[2:]) 14 | min_xy = np.maximum(box_a[:, :2], box_b[:2]) 15 | inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) 16 | return inter[:, 0] * inter[:, 1] 17 | 18 | 19 | def jaccard_numpy(box_a, box_b): 20 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 21 | is simply the intersection over union of two boxes. 
22 | E.g.: 23 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 24 | Args: 25 | box_a: Multiple bounding boxes, Shape: [num_boxes,4] 26 | box_b: Single bounding box, Shape: [4] 27 | Return: 28 | jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] 29 | """ 30 | inter = intersect(box_a, box_b) 31 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 32 | (box_a[:, 3]-box_a[:, 1])) # [A,B] 33 | area_b = ((box_b[2]-box_b[0]) * 34 | (box_b[3]-box_b[1])) # [A,B] 35 | union = area_a + area_b - inter 36 | return inter / union # [A,B] 37 | 38 | 39 | class Compose(object): 40 | """Composes several augmentations together. 41 | Args: 42 | transforms (List[Transform]): list of transforms to compose. 43 | Example: 44 | >>> augmentations.Compose([ 45 | >>> transforms.CenterCrop(10), 46 | >>> transforms.ToTensor(), 47 | >>> ]) 48 | """ 49 | 50 | def __init__(self, transforms): 51 | self.transforms = transforms 52 | 53 | def __call__(self, img, boxes=None, labels=None): 54 | for t in self.transforms: 55 | img, boxes, labels = t(img, boxes, labels) 56 | return img, boxes, labels 57 | 58 | 59 | class Lambda(object): 60 | """Applies a lambda as a transform.""" 61 | 62 | def __init__(self, lambd): 63 | assert isinstance(lambd, types.LambdaType) 64 | self.lambd = lambd 65 | 66 | def __call__(self, img, boxes=None, labels=None): 67 | return self.lambd(img, boxes, labels) 68 | 69 | 70 | class ConvertFromInts(object): 71 | def __call__(self, image, boxes=None, labels=None): 72 | return image.astype(np.float32), boxes, labels 73 | 74 | 75 | class SubtractMeans(object): 76 | def __init__(self, mean): 77 | self.mean = np.array(mean, dtype=np.float32) 78 | 79 | def __call__(self, image, boxes=None, labels=None): 80 | image = image.astype(np.float32) 81 | image -= self.mean 82 | return image.astype(np.float32), boxes, labels 83 | 84 | 85 | class ToAbsoluteCoords(object): 86 | def __call__(self, image, boxes=None, labels=None): 87 | height, width, channels = image.shape 88 | boxes[:, 0] *= width 89 | boxes[:, 2] *= width 90 | boxes[:, 1] *= height 91 | boxes[:, 3] *= height 92 | 93 | return image, boxes, labels 94 | 95 | 96 | class ToPercentCoords(object): 97 | def __call__(self, image, boxes=None, labels=None): 98 | height, width, channels = image.shape 99 | boxes[:, 0] /= width 100 | boxes[:, 2] /= width 101 | boxes[:, 1] /= height 102 | boxes[:, 3] /= height 103 | 104 | return image, boxes, labels 105 | 106 | 107 | class Resize(object): 108 | def __init__(self, size=300): 109 | self.size = size 110 | 111 | def __call__(self, image, boxes=None, labels=None): 112 | image = cv2.resize(image, (self.size, 113 | self.size)) 114 | return image, boxes, labels 115 | 116 | 117 | class RandomSaturation(object): 118 | def __init__(self, lower=0.5, upper=1.5): 119 | self.lower = lower 120 | self.upper = upper 121 | assert self.upper >= self.lower, "contrast upper must be >= lower." 122 | assert self.lower >= 0, "contrast lower must be non-negative." 
123 | 124 | def __call__(self, image, boxes=None, labels=None): 125 | if random.randint(2): 126 | image[:, :, 1] *= random.uniform(self.lower, self.upper) 127 | 128 | return image, boxes, labels 129 | 130 | 131 | class RandomHue(object): 132 | def __init__(self, delta=18.0): 133 | assert delta >= 0.0 and delta <= 360.0 134 | self.delta = delta 135 | 136 | def __call__(self, image, boxes=None, labels=None): 137 | if random.randint(2): 138 | image[:, :, 0] += random.uniform(-self.delta, self.delta) 139 | image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 140 | image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 141 | return image, boxes, labels 142 | 143 | 144 | class RandomLightingNoise(object): 145 | def __init__(self): 146 | self.perms = ((0, 1, 2), (0, 2, 1), 147 | (1, 0, 2), (1, 2, 0), 148 | (2, 0, 1), (2, 1, 0)) 149 | 150 | def __call__(self, image, boxes=None, labels=None): 151 | if random.randint(2): 152 | swap = self.perms[random.randint(len(self.perms))] 153 | shuffle = SwapChannels(swap) # shuffle channels 154 | image = shuffle(image) 155 | return image, boxes, labels 156 | 157 | 158 | class ConvertColor(object): 159 | def __init__(self, current, transform): 160 | self.transform = transform 161 | self.current = current 162 | 163 | def __call__(self, image, boxes=None, labels=None): 164 | if self.current == 'BGR' and self.transform == 'HSV': 165 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 166 | elif self.current == 'RGB' and self.transform == 'HSV': 167 | image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV) 168 | elif self.current == 'BGR' and self.transform == 'RGB': 169 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 170 | elif self.current == 'HSV' and self.transform == 'BGR': 171 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 172 | elif self.current == 'HSV' and self.transform == "RGB": 173 | image = cv2.cvtColor(image, cv2.COLOR_HSV2RGB) 174 | else: 175 | raise NotImplementedError 176 | return image, boxes, labels 177 | 178 | 179 | class RandomContrast(object): 180 | def __init__(self, lower=0.5, upper=1.5): 181 | self.lower = lower 182 | self.upper = upper 183 | assert self.upper >= self.lower, "contrast upper must be >= lower." 184 | assert self.lower >= 0, "contrast lower must be non-negative." 
185 | 186 | # expects float image 187 | def __call__(self, image, boxes=None, labels=None): 188 | if random.randint(2): 189 | alpha = random.uniform(self.lower, self.upper) 190 | image *= alpha 191 | return image, boxes, labels 192 | 193 | 194 | class RandomBrightness(object): 195 | def __init__(self, delta=32): 196 | assert delta >= 0.0 197 | assert delta <= 255.0 198 | self.delta = delta 199 | 200 | def __call__(self, image, boxes=None, labels=None): 201 | if random.randint(2): 202 | delta = random.uniform(-self.delta, self.delta) 203 | image += delta 204 | return image, boxes, labels 205 | 206 | 207 | class ToCV2Image(object): 208 | def __call__(self, tensor, boxes=None, labels=None): 209 | return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels 210 | 211 | 212 | class ToTensor(object): 213 | def __call__(self, cvimage, boxes=None, labels=None): 214 | return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels 215 | 216 | 217 | class RandomSampleCrop(object): 218 | """Crop 219 | Arguments: 220 | img (Image): the image being input during training 221 | boxes (Tensor): the original bounding boxes in pt form 222 | labels (Tensor): the class labels for each bbox 223 | mode (float tuple): the min and max jaccard overlaps 224 | Return: 225 | (img, boxes, classes) 226 | img (Image): the cropped image 227 | boxes (Tensor): the adjusted bounding boxes in pt form 228 | labels (Tensor): the class labels for each bbox 229 | """ 230 | def __init__(self): 231 | self.sample_options = ( 232 | # using entire original input image 233 | None, 234 | # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9 235 | (0.1, None), 236 | (0.3, None), 237 | (0.7, None), 238 | (0.9, None), 239 | # randomly sample a patch 240 | (None, None), 241 | ) 242 | 243 | def __call__(self, image, boxes=None, labels=None): 244 | height, width, _ = image.shape 245 | while True: 246 | # randomly choose a mode 247 | mode = random.choice(self.sample_options) 248 | if mode is None: 249 | return image, boxes, labels 250 | 251 | min_iou, max_iou = mode 252 | if min_iou is None: 253 | min_iou = float('-inf') 254 | if max_iou is None: 255 | max_iou = float('inf') 256 | 257 | # max trails (50) 258 | for _ in range(50): 259 | current_image = image 260 | 261 | w = random.uniform(0.3 * width, width) 262 | h = random.uniform(0.3 * height, height) 263 | 264 | # aspect ratio constraint b/t .5 & 2 265 | if h / w < 0.5 or h / w > 2: 266 | continue 267 | 268 | left = random.uniform(width - w) 269 | top = random.uniform(height - h) 270 | 271 | # convert to integer rect x1,y1,x2,y2 272 | rect = np.array([int(left), int(top), int(left+w), int(top+h)]) 273 | 274 | # calculate IoU (jaccard overlap) b/t the cropped and gt boxes 275 | overlap = jaccard_numpy(boxes, rect) 276 | 277 | # is min and max overlap constraint satisfied? 
if not try again 278 | if overlap.min() < min_iou and max_iou < overlap.max(): 279 | continue 280 | 281 | # cut the crop from the image 282 | current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], 283 | :] 284 | 285 | # keep overlap with gt box IF center in sampled patch 286 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 287 | 288 | # mask in all gt boxes that above and to the left of centers 289 | m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) 290 | 291 | # mask in all gt boxes that under and to the right of centers 292 | m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) 293 | 294 | # mask in that both m1 and m2 are true 295 | mask = m1 * m2 296 | 297 | # have any valid boxes? try again if not 298 | if not mask.any(): 299 | continue 300 | 301 | # take only matching gt boxes 302 | current_boxes = boxes[mask, :].copy() 303 | 304 | # take only matching gt labels 305 | current_labels = labels[mask] 306 | 307 | # should we use the box left and top corner or the crop's 308 | current_boxes[:, :2] = np.maximum(current_boxes[:, :2], 309 | rect[:2]) 310 | # adjust to crop (by substracting crop's left,top) 311 | current_boxes[:, :2] -= rect[:2] 312 | 313 | current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], 314 | rect[2:]) 315 | # adjust to crop (by substracting crop's left,top) 316 | current_boxes[:, 2:] -= rect[:2] 317 | 318 | return current_image, current_boxes, current_labels 319 | 320 | 321 | class Expand(object): 322 | def __init__(self, mean): 323 | self.mean = mean 324 | 325 | def __call__(self, image, boxes, labels): 326 | if random.randint(2): 327 | return image, boxes, labels 328 | 329 | height, width, depth = image.shape 330 | ratio = random.uniform(1, 4) 331 | left = random.uniform(0, width*ratio - width) 332 | top = random.uniform(0, height*ratio - height) 333 | 334 | expand_image = np.zeros( 335 | (int(height*ratio), int(width*ratio), depth), 336 | dtype=image.dtype) 337 | expand_image[:, :, :] = self.mean 338 | expand_image[int(top):int(top + height), 339 | int(left):int(left + width)] = image 340 | image = expand_image 341 | 342 | boxes = boxes.copy() 343 | boxes[:, :2] += (int(left), int(top)) 344 | boxes[:, 2:] += (int(left), int(top)) 345 | 346 | return image, boxes, labels 347 | 348 | 349 | class RandomMirror(object): 350 | def __call__(self, image, boxes, classes): 351 | _, width, _ = image.shape 352 | if random.randint(2): 353 | image = image[:, ::-1] 354 | boxes = boxes.copy() 355 | boxes[:, 0::2] = width - boxes[:, 2::-2] 356 | return image, boxes, classes 357 | 358 | 359 | class SwapChannels(object): 360 | """Transforms a tensorized image by swapping the channels in the order 361 | specified in the swap tuple. 
362 | Args: 363 | swaps (int triple): final order of channels 364 | eg: (2, 1, 0) 365 | """ 366 | 367 | def __init__(self, swaps): 368 | self.swaps = swaps 369 | 370 | def __call__(self, image): 371 | """ 372 | Args: 373 | image (Tensor): image tensor to be transformed 374 | Return: 375 | a tensor with channels swapped according to swap 376 | """ 377 | # if torch.is_tensor(image): 378 | # image = image.data.cpu().numpy() 379 | # else: 380 | # image = np.array(image) 381 | image = image[:, :, self.swaps] 382 | return image 383 | 384 | 385 | class PhotometricDistort(object): 386 | def __init__(self): 387 | self.pd = [ 388 | RandomContrast(), # RGB 389 | ConvertColor(current="RGB", transform='HSV'), # HSV 390 | RandomSaturation(), # HSV 391 | RandomHue(), # HSV 392 | ConvertColor(current='HSV', transform='RGB'), # RGB 393 | RandomContrast() # RGB 394 | ] 395 | self.rand_brightness = RandomBrightness() 396 | self.rand_light_noise = RandomLightingNoise() 397 | 398 | def __call__(self, image, boxes, labels): 399 | im = image.copy() 400 | im, boxes, labels = self.rand_brightness(im, boxes, labels) 401 | if random.randint(2): 402 | distort = Compose(self.pd[:-1]) 403 | else: 404 | distort = Compose(self.pd[1:]) 405 | im, boxes, labels = distort(im, boxes, labels) 406 | return self.rand_light_noise(im, boxes, labels) 407 | 408 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import torch.backends.cudnn as cudnn 5 | import torch.nn.functional as F 6 | import torchvision.transforms as transforms 7 | import torchvision 8 | from torch.nn import init 9 | from torch.utils.data import DataLoader 10 | from torch.autograd import Variable 11 | 12 | import numpy as np 13 | 14 | import visdom 15 | import logging 16 | 17 | import os, sys, time, io, datetime 18 | import random 19 | import argparse 20 | 21 | from network.RefineDet import RefineDetArm, RefineDetObm 22 | from network.FlowNet import FlowNetS 23 | from network.network_utils import EmbeddingNetwork, L2Norm 24 | from network.MultiBoxLoss import RefineMultiBoxLoss 25 | from dataset.bosch import BoschTrainDetection, BoschTestDetection, detection_collate 26 | from dataset.data_preprocessing import TrainAugmentation, TestTransform 27 | from network.prior_box import PriorBox 28 | 29 | def reindex_tensor(input_tensor, input_index): 30 | if isinstance(input_index, list): 31 | input_index = torch.Tensor(input_index) 32 | elif isinstance(input_index, np.array): 33 | input_index = torch.from_numpy(input_index) 34 | input_index = input_index.long() 35 | per_batch_length = np.prod(np.array(input_tensor.size())[1: ]) 36 | expand_index = input_index.unsqueeze(-1).repeat(1, per_batch_length).view(input_tensor.size()) 37 | return torch.gather(input_tensor, 0, expand_index) 38 | 39 | def reindex(array, index): 40 | return [array[item] for item in index] 41 | 42 | def str2bool(v): 43 | return v.lower() in ("yes", "true", "1", "t") 44 | 45 | """ 46 | def cosine_similarity(preceed, current): 47 | preceed_vector = EmbeddingNetwork(preceed) 48 | current_vector = EmbeddingNetwork(current) 49 | preceed_vector_sum_sqrt = torch.sqrt(torch.sum(torch.pow(preceed_vector, 2), 1)) 50 | current_vector_sum_sqrt = torch.sqrt(torch.sum(torch.pow(current_vector, 2), 1)) 51 | 52 | return torch.exp(preceed_vector * current_vector / (preceed_vector_sum_sqrt * current_vector_sum_sqrt)) 53 | 
""" 54 | 55 | # input: [batch_size, channels, height, width], gather_index: [batch_index, 2, height, width] 56 | def gather_nd(input, gather_index): 57 | input.cuda() 58 | gather_index.cuda() 59 | base_index_x, base_index_y = torch.meshgrid([torch.arange(input.size()[2]), torch.arange(input.size()[3])]) 60 | base_index = torch.stack([base_index_x, base_index_y], -1).view(input.size()[2], input.size()[3], 2) 61 | base_index = torch.stack([base_index for _ in range(input.size()[0])]).double() 62 | 63 | input = input.permute(0, 2, 3, 1).contiguous().double() 64 | gather_index = gather_index.permute(0, 2, 3, 1).contiguous().double() 65 | gather_index = base_index + gather_index 66 | gather_index = gather_index.view(-1, 2).double() 67 | clamp_gather_index = torch.DoubleTensor(gather_index.size()).cuda() 68 | clamp_gather_index[:, 0] = torch.clamp(gather_index[:, 0], 0., float(input.size()[1] - 1)).double() 69 | clamp_gather_index[:, 1] = torch.clamp(gather_index[:, 1], 0., float(input.size()[2] - 1)).double() 70 | gather_index_ceil = torch.ceil(clamp_gather_index).double() 71 | gather_index_floor = torch.floor(clamp_gather_index).double() 72 | 73 | output = [] 74 | for i in range(gather_index.size()[0]): 75 | batch_index = i // (input.size()[1] * input.size()[1]) 76 | 77 | cor_x, cor_y = clamp_gather_index[i][0], clamp_gather_index[i][1] 78 | cor_x_ceil, cor_y_ceil = gather_index_ceil[i][0], gather_index_ceil[i][1] 79 | cor_x_floor, cor_y_floor = gather_index_floor[i][0], gather_index_floor[i][1] 80 | weight_ceil_x, weight_ceil_y = cor_x - cor_x_floor, cor_y - cor_y_floor 81 | weight_floor_x, weight_floor_y = cor_x_ceil - cor_x, cor_y_ceil - cor_y 82 | 83 | output_ceil = input[batch_index, cor_x_ceil.int(), cor_y_ceil.int()] 84 | output_floor = input[batch_index, cor_x_floor.int(), cor_y_floor.int()] 85 | output_y_ceil = weight_ceil_x * input[batch_index, cor_x_ceil.int(), cor_y_ceil.int()] + weight_floor_x * input[batch_index, cor_x_floor.int(), cor_y_ceil.int()] 86 | output_y_floor = weight_ceil_x * input[batch_index, cor_x_ceil.int(), cor_y_floor.int()] + weight_floor_x * input[batch_index, cor_x_floor.int(), cor_y_floor.int()] 87 | output.append(weight_ceil_y * output_y_ceil + weight_floor_y * output_y_floor) 88 | 89 | result = torch.stack(output, 0).view(tuple(input.size())).permute(0, 3, 1, 2).contiguous().float() 90 | 91 | return result 92 | 93 | # TODO: set batch_size=1 may cause error from cosine similarity part, check unsqueeze afterwards 94 | parser = argparse.ArgumentParser() 95 | 96 | parser.add_argument('-v', '--version', default='RefineDet', help='feature network') 97 | parser.add_argument('-s', '--size', default=320, help='320 or 512 input size') 98 | parser.add_argument('-d', '--dataset', default='Bosch', help='Cityscapes, ImageNet VID, Bosch or Sensetime') 99 | parser.add_argument('-b', '--batch_size', default=4, type=int, help='batch size') 100 | 101 | parser.add_argument('--jaccard_threshold', default=0.5, type=float, help='min jaccard index for matching') 102 | parser.add_argument('--num_workers', default=0, type=int, help='number of workers in dataloading') 103 | parser.add_argument('--cuda', default=True, type=bool, help='use cuda') 104 | parser.add_argument('--gpu_id', default=0, type=int, help='gpu list') 105 | parser.add_argument('--lr', '--learning_rate ', default=1e-4, help='initial learning rate') 106 | parser.add_argument('--base_lr', default=1e-8, help='base feature network learning rate') 107 | parser.add_argument('--flownet_lr', default=1e-8, help='flownet 
learning rate') 108 | parser.add_argument('--momentum', default=0.9, type=float, help='momentum') 109 | 110 | parser.add_argument('--feature_basenet', default="models/vgg16_reducedfc.pth", help='feature network pretrained model') 111 | parser.add_argument('--flow_basenet', default="models/FlowNet2-S_checkpoint.pth.tar", help='flow network pretrained model') 112 | parser.add_argument('--dataset_path', default="/mnt/lustre/zhoukeyang/Bosch_Dataset") 113 | 114 | parser.add_argument('--debug_step', default=100, help='debug step') 115 | parser.add_argument('--q_threshold', default=0., help='q threshold') 116 | parser.add_argument('--use_aggr', default=True, help='use aggregation') 117 | parser.add_argument('--use_partial', default=True, help='use partial feature updating') 118 | parser.add_argument('--mask_loss_weight', default=1., type=float, help='update mask loss weight') 119 | 120 | parser.add_argument('--resume', default=False, help='resume net for retraining') 121 | parser.add_argument('--resume_epoch', default=0, type=int, help='resume iteration for retraining') 122 | 123 | parser.add_argument('--max_epoch', default=300, type=int, help='max epoch for retraining') 124 | parser.add_argument('--valid_iter', default=1, type=int, help='epoch that print the loss') 125 | parser.add_argument('--save_folder', default='models', type=str, help='location to save checkpoint') 126 | parser.add_argument('--save_iter', default=3, type=int, help='epoch that save the model') 127 | parser.add_argument('--visdom', default=False, help='use visualization') 128 | parser.add_argument('--num_classes', default=14, help='num classes') 129 | 130 | parser.add_argument('--is_training', default=True, type=bool, help='training or validating') 131 | 132 | args = parser.parse_args() 133 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 134 | DEVICE = torch.device("cuda:0" if torch.cuda.is_available() and args.cuda else "cpu") 135 | 136 | VOC_320 = { 137 | 'feature_maps': [40, 20, 10, 5], 138 | 'min_dim': 320, 139 | 'steps': [8, 16, 32, 64], 140 | 'min_sizes': [32, 64, 128, 256], 141 | 'max_sizes': [], 142 | 'aspect_ratios': [[2], [2], [2], [2]], 143 | 'variance': [0.1, 0.2], 144 | 'clip': True, 145 | } 146 | 147 | 148 | def test(dataloader, refinedet_arm, refinedet_obm, flownetS, arm_criterion, obm_criterion, device): 149 | refinedet_arm.eval(True) 150 | refinedet_obm.eval(True) 151 | flownetS.eval(True) 152 | 153 | 154 | def train(dataloader, refinedet_arm, refinedet_obm, flownetS, arm_criterion, obm_criterion, optimizer, device, epoch): 155 | refinedet_arm.train(True) 156 | refinedet_obm.train(True) 157 | flownetS.train(True) 158 | 159 | previous_images, previous_arm_conf, previous_arm_loc = None, None, None 160 | preceed_out, preceed_featuremap, preceed_conf, preceed_loc, preceed_input = [], [], [], [], [] 161 | 162 | running_loss = 0. 163 | running_arm_regression_loss = 0. 164 | running_arm_classification_loss = 0. 165 | running_obm_regression_loss = 0. 166 | running_obm_classification_loss = 0. 167 | running_updating_mask_loss = 0. 
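    # Per-iteration flow of the loop below:
    #   1. for each batch element, randomly pick an earlier element in the batch as its key frame (corresponding_key)
    #   2. run the ARM part of RefineDet on the current frames
    #   3. run FlowNetS on the concatenated (current, key) images to get the flow field and q_propagate
    #   4. if use_partial: derive updating_mask from q_propagate and blend warped key-frame features into the current features
    #   5. if use_aggr: aggregate key-frame and current features, weighted by an embedding cosine similarity
    #   6. run the OBM part and accumulate the ARM/OBM multibox losses plus the updating-mask penalty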
168 | 169 | for i, data in enumerate(dataloader): 170 | previous_images, previous_arm_conf, previous_arm_loc = None, None, None 171 | preceed_out, preceed_featuremap, preceed_conf, preceed_loc, preceed_input = [], [], [], [], [] 172 | 173 | corresponding_key = [] 174 | for batch_index in range(args.batch_size): 175 | corresponding_key.append(random.randint(0, batch_index)) 176 | images, targets = data[0], data[1] 177 | images = images.to(device) 178 | #boxes = boxes.to(device) 179 | #labels = labels.to(device) 180 | preceed_input = images 181 | 182 | #preceed_images = torch.stack((torch.unbind(preceed_images, 0)[corresponding_key]), 0).to(device) 183 | #preceed_images = torch.stack(reindex(torch.unbind(preceed_input, 0), corresponding_key), 0).to(device) 184 | preceed_images = reindex_tensor(preceed_input, corresponding_key) 185 | 186 | optimizer.zero_grad() 187 | 188 | # arm_sources 512, 512, 1024, 512 189 | out, arm_sources, arm_conf, arm_loc = refinedet_arm(images) 190 | images_stack = torch.cat((images, preceed_images), 1) 191 | images_stack = F.interpolate(images_stack, size=(256, 256), mode='bilinear') 192 | flow_result, flow_list, q_propagate = flownetS(images_stack) 193 | 194 | preceed_out.append(out) 195 | preceed_featuremap.append(arm_sources) 196 | preceed_conf = arm_conf 197 | preceed_loc = arm_loc 198 | arm_out = out 199 | 200 | updating_mask = torch.zeros(size=(1, )) 201 | 202 | # enforce q_propagate equals to 1 and 0 with 1/3 probability add later... 203 | if args.use_partial: 204 | updating_mask = torch.clamp(q_propagate - args.q_threshold + 0.5, 0., 1.) 205 | prop_condition = 1 - updating_mask 206 | prop_condition = prop_condition.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1) 207 | 208 | new_arm_sources = [] 209 | 210 | for j, arm_source_item in enumerate(arm_sources): 211 | flow_map = F.interpolate(flow_result, size=tuple(arm_source_item.size())[2:], mode='bilinear') * float(arm_source_item.size()[2] / flow_result.size()[2]) 212 | #flow_map *= float(arm_source_item.size()[2] / flow_result.size()[2]) 213 | new_preceed = gather_nd(reindex_tensor(arm_source_item, corresponding_key), flow_map) 214 | new_arm_sources.append(prop_condition * new_preceed + (1. - prop_condition) * arm_source_item) 215 | #new_arm_sources.append(prop_condition * gather_nd(torch.stack(reindex(torch.unbind(arm_source_item, 0), corresponding_key), 0), flow_map) + (1. - prop_condition) * arm_source_item) 216 | """ 217 | arm_conf_original_shape = arm_conf.size() 218 | arm_loc_original_shape = arm_loc.size() 219 | 220 | arm_conf_shape = list(flow_result.size()) 221 | arm_conf_shape[1] = 2 222 | arm_loc_shape = list(flow_result.size()) 223 | arm_loc_shape[1] = 2 224 | 225 | arm_conf = arm_conf.view(arm_conf_shape) 226 | arm_loc = arm_loc.view(arm_loc_shape) 227 | arm_conf = prop_condition * gather_nd(torch.stack(reindex(torch.unbind(arm_conf, 0), corresponding_key), 0), flow_result) + (1. - prop_condition) * arm_conf 228 | arm_loc = prop_condition * gather_nd(torch.stack(reindex(torch.unbind(arm_loc, 0), corresponding_key), 0), flow_result) + (1. - prop_condition) * arm_loc 229 | 230 | arm_conf = arm_conf.view(arm_conf_original_shape) 231 | arm_loc = arm_loc.view(arm_loc_original_shape) 232 | """ 233 | arm_out = new_arm_sources[-1] 234 | arm_sources = new_arm_sources 235 | 236 | # warp operation add later... 
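        # Aggregation branch below: embed the key-frame and current ARM features, turn their
        # cosine similarity into blending weights, warp the key-frame features with the flow
        # field (gather_nd performs the bilinear sampling), and blend the key-frame and
        # current feature maps using those weights.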
237 | if args.use_aggr: 238 | def cosine_similarity(preceed, current, Embedding): 239 | preceed_vector = Embedding(preceed) 240 | current_vector = Embedding(current) 241 | preceed_vector_sum_sqrt = torch.sqrt(torch.sum(torch.pow(preceed_vector, 2), -1)) 242 | current_vector_sum_sqrt = torch.sqrt(torch.sum(torch.pow(current_vector, 2), -1)) 243 | 244 | return torch.exp(torch.sum(preceed_vector * current_vector, 1) / (preceed_vector_sum_sqrt * current_vector_sum_sqrt)).float() 245 | 246 | preceed = [] 247 | new_arm_sources = [] 248 | new_preceed = [] 249 | for item in arm_sources: 250 | #preceed.append(torch.stack(reindex(torch.unbind(item, 0), corresponding_key), 0)) 251 | preceed.append(reindex_tensor(item, corresponding_key)) 252 | succeed = arm_sources 253 | 254 | # wrap key frame using flow result 255 | for j, preceed_map in enumerate(preceed): 256 | flow_map = F.interpolate(flow_result, size=tuple(preceed_map.size())[2: ], mode='bilinear') 257 | flow_map *= (preceed_map.size()[2] / flow_result.size()[2]) 258 | 259 | propagate_similarity = cosine_similarity(preceed[j].float(), succeed[j].float(), EmbeddingNetwork[j]) 260 | propagate_similarity = propagate_similarity / (1. + propagate_similarity) 261 | self_similarity = 1. - propagate_similarity 262 | propagate_similarity, self_similarity = torch.mean(propagate_similarity), torch.mean(self_similarity) 263 | 264 | new_preceed.append(gather_nd(preceed_map, flow_map)) 265 | new_arm_sources.append(self_similarity * succeed[j] + propagate_similarity * preceed[j]) 266 | arm_out = new_arm_sources[-1] 267 | arm_sources = new_arm_sources 268 | 269 | obm_out = refinedet_obm(arm_out, arm_sources, arm_conf, arm_loc, is_training=True) 270 | feature_layer, arm_conf, arm_loc, obm_conf, obm_loc = obm_out 271 | 272 | #labels = torch.unsqueeze(labels, -1) 273 | #arm_targets = torch.cat((boxes, torch.gt(labels, 0).float()), -1) 274 | #obm_targets = torch.cat((boxes, labels.float()), -1) 275 | arm_regression_loss, arm_classification_loss = arm_criterion((arm_loc, arm_conf), priors, targets) 276 | obm_regression_loss, obm_classification_loss = obm_criterion((obm_loc, obm_conf), priors, targets, (arm_loc, arm_conf), False) 277 | 278 | arm_detection_loss = (arm_regression_loss + arm_classification_loss).double() 279 | obm_detection_loss = (obm_regression_loss + obm_classification_loss).double() 280 | update_mask_loss = torch.sum(updating_mask).double() 281 | loss = arm_detection_loss + obm_detection_loss + update_mask_loss 282 | 283 | #loss.backward() 284 | #optimizer.step() 285 | 286 | running_loss += loss.item() 287 | running_arm_regression_loss += arm_regression_loss.item() 288 | running_arm_classification_loss += arm_classification_loss.item() 289 | running_obm_regression_loss += obm_regression_loss.item() 290 | running_obm_classification_loss += obm_classification_loss.item() 291 | running_updating_mask_loss += update_mask_loss.item() 292 | 293 | if i and i % args.debug_step == 0: 294 | avg_loss = running_loss / args.debug_step 295 | avg_arm_reg_loss = running_arm_regression_loss / args.debug_step 296 | avg_arm_clf_loss = running_arm_classification_loss / args.debug_step 297 | avg_obm_reg_loss = running_obm_regression_loss / args.debug_step 298 | avg_obm_clf_loss = running_obm_classification_loss / args.debug_step 299 | avg_update_mask_loss = running_updating_mask_loss / args.debug_step 300 | print("Epoch: {}, Step: {}, Avg loss: {}, Avg arm loss: {}, Avg obm loss: {}, Update mask loss: {}" 301 | .format(epoch, i, avg_loss, avg_arm_reg_loss + 
avg_arm_clf_loss, avg_obm_reg_loss + avg_obm_clf_loss, avg_update_mask_loss)) 302 | running_loss, running_arm_regression_loss, running_arm_classification_loss, running_obm_regression_loss, running_obm_classification_loss, running_updating_mask_loss = 0., 0., 0., 0. 303 | 304 | 305 | if __name__ == "__main__": 306 | save_folder = args.save_folder + "_" + datetime.datetime.now().strftime('%Y%m%d%H%M%S') 307 | if not os.path.exists(save_folder): 308 | os.mkdir(save_folder) 309 | 310 | if args.visdom: 311 | viz = visdom.Visdom() 312 | 313 | cfg = VOC_320 314 | priorbox = PriorBox(cfg) 315 | priors = priorbox.forward() 316 | #priors = Variable(priorbox.forward(), volatile=True) 317 | 318 | train_transform = TrainAugmentation(args.size, np.array([123, 117, 104]), 128.) 319 | train_dataset = BoschTrainDetection(root_dir=args.dataset_path, \ 320 | yaml_file="train.yaml", transform=train_transform, target_transform=False) 321 | 322 | print(len(train_dataset)) 323 | 324 | label_file = os.path.join(save_folder, "bosch-dataset-labels.txt") 325 | with open(label_file, "w") as f: 326 | f.write("\n".join(train_dataset.class_names)) 327 | num_classes = len(train_dataset.class_names) 328 | 329 | refinedet_arm = RefineDetArm(vgg_type='300', in_channels=3, batch_norm=args.is_training) 330 | refinedet_obm = RefineDetObm(num_classes=num_classes, batch_norm=args.is_training) 331 | flownetS = FlowNetS(in_channels=6, is_training=True) 332 | 333 | EmbeddingCosine1 = EmbeddingNetwork(512) 334 | EmbeddingCosine2 = EmbeddingNetwork(512) 335 | EmbeddingCosine3 = EmbeddingNetwork(1024) 336 | EmbeddingCosine4 = EmbeddingNetwork(512) 337 | 338 | EmbeddingNetwork = [EmbeddingCosine1, EmbeddingCosine2, EmbeddingCosine3, EmbeddingCosine4] 339 | 340 | total_net = nn.ModuleList( 341 | [ 342 | refinedet_arm, 343 | refinedet_obm, 344 | flownetS, 345 | ] 346 | ) 347 | 348 | train_dataloader = DataLoader(train_dataset, args.batch_size, num_workers=args.num_workers, shuffle=False, collate_fn=detection_collate) 349 | 350 | if not args.resume: 351 | def xavier(param): 352 | init.xavier_uniform(param) 353 | 354 | def weights_init(m): 355 | for key in m.state_dict(): 356 | if key.split('.')[-1] == 'weight': 357 | if 'conv' in key: 358 | init.kaiming_normal(m.state_dict()[key], mode='fan_out') 359 | if 'bn' in key: 360 | m.state_dict()[key][...] = 1 361 | elif key.split('.')[-1] == 'bias': 362 | m.state_dict()[key][...] = 0 363 | 364 | refinedet_arm.apply(weights_init) 365 | refinedet_obm.apply(weights_init) 366 | flownetS.apply(weights_init) 367 | for item in EmbeddingNetwork: 368 | item.apply(weights_init) 369 | 370 | feature_weight = torch.load(args.feature_basenet) 371 | flow_weight = torch.load(args.flow_basenet)['state_dict'] 372 | 373 | refinedet_arm.vgg_list.load_state_dict(feature_weight) 374 | 375 | # select and restore parameters partly 376 | flownet_dict = {} 377 | for k, v in flow_weight.items(): 378 | if 'conv' in k: 379 | new_k = k.split('.')[0] + '.' 
+ k.split('.')[-1] 380 | if 'deconv' not in k and 'bias' in k: 381 | continue 382 | elif 'upsample' in k: 383 | new_k = k.split('_')[0] + '_' + k.split('_')[1] + '_' + k.split('_')[3] 384 | else: 385 | new_k = k 386 | flownet_dict[new_k] = v 387 | 388 | flownet_dict['q_propagate.weight'] = flownetS.state_dict()['q_propagate.weight'] 389 | flownet_stat_dict = flownetS.state_dict() 390 | flownet_stat_dict.update(flownet_dict) 391 | flownetS.load_state_dict(flownet_stat_dict) 392 | 393 | vgg_pretrained_list = [] 394 | flownet_pretrained_list = [] 395 | random_list = [] 396 | for name, param in list(total_net.named_parameters()): 397 | if 'vgg_list' in name: 398 | vgg_pretrained_list.append(param) 399 | elif 'predict_flow' in name: 400 | flownet_pretrained_list.append(param) 401 | else: 402 | random_list.append(param) 403 | 404 | else: 405 | resume_path = os.path.join(save_folder, args.resume_epoch + ".pth") 406 | state_dict = torch.load(resume_path) 407 | from collections import OrderedDict 408 | 409 | new_state_dict = OrderedDict() 410 | for k, v in state_dict.items(): 411 | if k[: 7] == "module.": 412 | name = k[7: ] 413 | else: 414 | name = k 415 | new_state_dict[name] = v 416 | total_net.load_state_dict(new_state_dict) 417 | 418 | if args.gpu_id: 419 | refinedet_arm = torch.nn.DataParallel(refinedet_arm, device_ids=args.gpu_id) 420 | refinedet_obm = torch.nn.DataParallel(refinedet_obm, device_ids=args.gpu_id) 421 | flownetS = torch.nn.DataParallel(flownetS, device_ids=args.gpu_id) 422 | 423 | if args.cuda: 424 | refinedet_arm.cuda() 425 | refinedet_obm.cuda() 426 | flownetS.cuda() 427 | cudnn.benchmark = True 428 | 429 | optimizer = optim.Adam( 430 | [ 431 | {"params": vgg_pretrained_list, "lr": args.base_lr}, 432 | {"params": flownet_pretrained_list, "lr": args.flownet_lr}, 433 | {"params": random_list} 434 | ], lr=args.lr) 435 | arm_criterion = RefineMultiBoxLoss(2, 0.5, True, 0, True, 3, 0.5, False) 436 | obm_criterion = RefineMultiBoxLoss(num_classes, 0.5, True, 0, True, 3, 0.5, False) 437 | 438 | priors = torch.Tensor(priors.astype(np.float32)).cpu() 439 | 440 | for epoch in range(args.resume_epoch, args.max_epoch): 441 | train(train_dataloader, refinedet_arm, refinedet_obm, flownetS, arm_criterion, obm_criterion, optimizer, DEVICE, epoch) 442 | torch.save(total_net.state_dict(), os.path.join(save_folder, "Epoch_" + str(epoch) + ".pth")) 443 | 444 | -------------------------------------------------------------------------------- /models/bosch-dataset-labels.txt: -------------------------------------------------------------------------------- 1 | None 2 | Green 3 | GreenStraightLeft 4 | GreenLeft 5 | RedLeft 6 | GreenStraightRight 7 | Red 8 | off 9 | GreenRight 10 | GreenStraight 11 | Yellow 12 | RedRight 13 | RedStraight 14 | RedStraightLeft -------------------------------------------------------------------------------- /network/FlowNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from torch.nn import init 5 | 6 | 7 | def conv(in_channels, out_channels, kernel_size=3, stride=1, is_training=True): 8 | if is_training: 9 | return nn.Sequential( 10 | nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=(kernel_size - 1) // 2, bias=False), 11 | nn.BatchNorm2d(out_channels), 12 | nn.LeakyReLU(0.1, inplace=True), 13 | ) 14 | else: 15 | return nn.Sequential( 16 | nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, 
padding=(kernel_size - 1) // 2, bias=True), 17 | nn.LeakyReLU(0.1, inplace=True), 18 | ) 19 | 20 | 21 | class FlowNetS(nn.Module): 22 | def __init__(self, in_channels=6, is_training=True): 23 | super(FlowNetS, self).__init__() 24 | 25 | self.is_training = is_training 26 | self.in_channels = in_channels 27 | 28 | self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=7, stride=2, padding=3, bias=(not is_training)) 29 | self.batch_norm_1 = nn.BatchNorm2d(64) 30 | self.leaky_relu_1 = nn.LeakyReLU(0.1, inplace=True) 31 | self.conv2 = nn.Conv2d(64, 128, kernel_size=5, stride=2, padding=2, bias=(not is_training)) 32 | self.batch_norm_2 = nn.BatchNorm2d(128) 33 | self.leaky_relu_2 = nn.LeakyReLU(0.1, inplace=True) 34 | self.conv3 = nn.Conv2d(128, 256, kernel_size=5, stride=2, padding=2, bias=(not is_training)) 35 | self.batch_norm_3 = nn.BatchNorm2d(256) 36 | self.leaky_relu_3 = nn.LeakyReLU(0.1, inplace=True) 37 | self.conv3_1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=(not is_training)) 38 | self.batch_norm_3_1 = nn.BatchNorm2d(256) 39 | self.leaky_relu_3_1 = nn.LeakyReLU(0.1, inplace=True) 40 | self.conv4 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=(not is_training)) 41 | self.batch_norm_4 = nn.BatchNorm2d(512) 42 | self.leaky_relu_4 = nn.LeakyReLU(0.1, inplace=True) 43 | self.conv4_1 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=(not is_training)) 44 | self.batch_norm_4_1 = nn.BatchNorm2d(512) 45 | self.leaky_relu_4_1 = nn.LeakyReLU(0.1, inplace=True) 46 | self.conv5 = nn.Conv2d(512, 512, kernel_size=3, stride=2, padding=1, bias=(not is_training)) 47 | self.batch_norm_5 = nn.BatchNorm2d(512) 48 | self.leaky_relu_5 = nn.LeakyReLU(0.1, inplace=True) 49 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=(not is_training)) 50 | self.batch_norm_5_1 = nn.BatchNorm2d(512) 51 | self.leaky_relu_5_1 = nn.LeakyReLU(0.1, inplace=True) 52 | self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, stride=2, padding=1, bias=(not is_training)) 53 | self.batch_norm_6 = nn.BatchNorm2d(1024) 54 | self.leaky_relu_6 = nn.LeakyReLU(0.1, inplace=True) 55 | self.conv6_1 = nn.Conv2d(1024, 1024, kernel_size=3, stride=1, padding=1, bias=(not is_training)) 56 | self.batch_norm_6_1 = nn.BatchNorm2d(1024) 57 | self.leaky_relu_6_1 = nn.LeakyReLU(0.1, inplace=True) 58 | 59 | self.deconv5 = nn.ConvTranspose2d(1024, 512, kernel_size=4, stride=2, padding=1, bias=True) 60 | self.deconv4 = nn.ConvTranspose2d(1026, 256, kernel_size=4, stride=2, padding=1, bias=True) 61 | self.deconv3 = nn.ConvTranspose2d(770, 128, kernel_size=4, stride=2, padding=1, bias=True) 62 | self.deconv2 = nn.ConvTranspose2d(386, 64, kernel_size=4, stride=2, padding=1, bias=True) 63 | 64 | self.predict_flow6 = nn.Conv2d(1024, 2, kernel_size=3, stride=1, padding=1, bias=True) 65 | self.predict_flow5 = nn.Conv2d(1026, 2, kernel_size=3, stride=1, padding=1, bias=True) 66 | self.predict_flow4 = nn.Conv2d(770, 2, kernel_size=3, stride=1, padding=1, bias=True) 67 | self.predict_flow3 = nn.Conv2d(386, 2, kernel_size=3, stride=1, padding=1, bias=True) 68 | self.predict_flow2 = nn.Conv2d(194, 2, kernel_size=3, stride=1, padding=1, bias=True) 69 | 70 | self.upsampled_flow6_5 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=False) 71 | self.upsampled_flow5_4 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=False) 72 | self.upsampled_flow4_3 = nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=False) 73 | self.upsampled_flow3_2 = 
nn.ConvTranspose2d(2, 2, kernel_size=4, stride=2, padding=1, bias=False) 74 | 75 | for m in self.modules(): 76 | if isinstance(m, nn.Conv2d): 77 | if m.bias is not None: 78 | init.uniform_(m.bias) 79 | init.xavier_uniform_(m.weight) 80 | 81 | if isinstance(m, nn.ConvTranspose2d): 82 | if m.bias is not None: 83 | init.uniform_(m.bias) 84 | init.xavier_uniform_(m.weight) 85 | 86 | self.q_propagate = nn.Conv2d(2, 1, kernel_size=3, stride=1, padding=1, bias=False) 87 | self.upsampled_q6_1 = nn.Upsample(scale_factor=64, mode='bilinear') 88 | self.upsampled_q5_1 = nn.Upsample(scale_factor=32, mode='bilinear') 89 | self.upsampled_q4_1 = nn.Upsample(scale_factor=16, mode='bilinear') 90 | self.upsampled_q3_1 = nn.Upsample(scale_factor=8, mode='bilinear') 91 | self.upsampled_q2_1 = nn.Upsample(scale_factor=4, mode='bilinear') 92 | self.upsampled_flow2_1 = nn.Upsample(scale_factor=4, mode='bilinear') 93 | 94 | def forward(self, x): 95 | #input = nn.functional.interpolate(x, size=(256, 256), mode='bilinear') 96 | 97 | out_conv1 = self.conv1(x) 98 | if self.is_training: 99 | out_conv1 = self.leaky_relu_1(self.batch_norm_1(out_conv1)) 100 | else: 101 | out_conv1 = self.leaky_relu_1(out_conv1) 102 | 103 | out_conv2 = self.conv2(out_conv1) 104 | if self.is_training: 105 | out_conv2 = self.leaky_relu_2(self.batch_norm_2(out_conv2)) 106 | else: 107 | out_conv2 = self.leaky_relu_2(out_conv2) 108 | 109 | out_conv3 = self.conv3(out_conv2) 110 | if self.is_training: 111 | out_conv3 = self.leaky_relu_3(self.batch_norm_3(out_conv3)) 112 | else: 113 | out_conv3 = self.leaky_relu_3(out_conv3) 114 | 115 | out_conv3 = self.conv3_1(out_conv3) 116 | if self.is_training: 117 | out_conv3 = self.leaky_relu_3_1(self.batch_norm_3_1(out_conv3)) 118 | else: 119 | out_conv3 = self.leaky_relu_3_1(out_conv3) 120 | 121 | out_conv4 = self.conv4(out_conv3) 122 | if self.is_training: 123 | out_conv4 = self.leaky_relu_4(self.batch_norm_4(out_conv4)) 124 | else: 125 | out_conv4 = self.leaky_relu_4(out_conv4) 126 | 127 | out_conv4 = self.conv4_1(out_conv4) 128 | if self.is_training: 129 | out_conv4 = self.leaky_relu_4_1(self.batch_norm_4_1(out_conv4)) 130 | else: 131 | out_conv4 = self.leaky_relu_4_1(out_conv4) 132 | 133 | out_conv5 = self.conv5(out_conv4) 134 | if self.is_training: 135 | out_conv5 = self.leaky_relu_5(self.batch_norm_5(out_conv5)) 136 | else: 137 | out_conv5 = self.leaky_relu_5(out_conv5) 138 | 139 | out_conv5 = self.conv5_1(out_conv5) 140 | if self.is_training: 141 | out_conv5 = self.leaky_relu_5_1(self.batch_norm_5_1(out_conv5)) 142 | else: 143 | out_conv5 = self.leaky_relu_5_1(out_conv5) 144 | 145 | out_conv6 = self.conv6(out_conv5) 146 | if self.is_training: 147 | out_conv6 = self.leaky_relu_6(self.batch_norm_6(out_conv6)) 148 | else: 149 | out_conv6 = self.leaky_relu_6(out_conv6) 150 | 151 | out_conv6 = self.conv6_1(out_conv6) 152 | if self.is_training: 153 | out_conv6 = self.leaky_relu_6_1(self.batch_norm_6_1(out_conv6)) 154 | else: 155 | out_conv6 = self.leaky_relu_6_1(out_conv6) 156 | 157 | 158 | flow6 = self.predict_flow6(out_conv6) 159 | flow6_up = self.upsampled_flow6_5(flow6) 160 | out_deconv5 = self.deconv5(out_conv6) 161 | q_propagate6 = self.upsampled_q6_1(self.q_propagate(flow6)) 162 | 163 | concat5 = torch.cat((out_conv5, out_deconv5, flow6_up), 1) 164 | flow5 = self.predict_flow5(concat5) 165 | flow5_up = self.upsampled_flow5_4(flow5) 166 | out_deconv4 = self.deconv4(concat5) 167 | q_propagate5 = self.upsampled_q5_1(self.q_propagate(flow5)) 168 | 169 | concat4 = torch.cat((out_conv4, out_deconv4, 
flow5_up), 1) 170 | flow4 = self.predict_flow4(concat4) 171 | flow4_up = self.upsampled_flow4_3(flow4) 172 | out_deconv3 = self.deconv3(concat4) 173 | q_propagate4 = self.upsampled_q4_1(self.q_propagate(flow4)) 174 | 175 | concat3 = torch.cat((out_conv3, out_deconv3, flow4_up), 1) 176 | flow3 = self.predict_flow3(concat3) 177 | flow3_up = self.upsampled_flow3_2(flow3) 178 | out_deconv2 = self.deconv2(concat3) 179 | q_propagate3 = self.upsampled_q3_1(self.q_propagate(flow3)) 180 | 181 | concat2 = torch.cat((out_conv2, out_deconv2, flow3_up), 1) 182 | flow2 = self.predict_flow2(concat2) 183 | q_propagate2 = self.upsampled_q2_1(self.q_propagate(flow2)) 184 | 185 | q_propagate = torch.cat((q_propagate2 * 4, q_propagate3 * 8, q_propagate4 * 16, q_propagate5 * 32, q_propagate6 * 64), 1) 186 | q_propagate = torch.mean(torch.abs(q_propagate), dim=3, keepdim=False) 187 | q_propagate = torch.mean(torch.abs(q_propagate), dim=2, keepdim=False) 188 | q_propagate = torch.mean(torch.abs(q_propagate), dim=1, keepdim=False) 189 | 190 | flow_result = self.upsampled_flow2_1(flow2) 191 | 192 | #flow_result = nn.functional.interpolate(flow_result, size=list(x.size())[2: ], mode='bilinear') 193 | 194 | if self.training: 195 | return flow_result, [flow2, flow3, flow4, flow5, flow6], q_propagate 196 | else: 197 | return flow_result, q_propagate 198 | -------------------------------------------------------------------------------- /network/MultiBoxLoss.py: -------------------------------------------------------------------------------- 1 | # source from: PytorchSSD-master 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from .box_utils import match, refine_match, log_sum_exp, decode 9 | GPU = False 10 | if torch.cuda.is_available(): 11 | GPU = True 12 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 13 | 14 | 15 | class RefineMultiBoxLoss(nn.Module): 16 | """SSD Weighted Loss Function 17 | Compute Targets: 18 | 1) Produce Confidence Target Indices by matching ground truth boxes 19 | with (default) 'priorboxes' that have jaccard index > threshold parameter 20 | (default threshold: 0.5). 21 | 2) Produce localization target by 'encoding' variance into offsets of ground 22 | truth boxes and their matched 'priorboxes'. 23 | 3) Hard negative mining to filter the excessive number of negative examples 24 | that comes with using a large number of default bounding boxes. 25 | (default negative:positive ratio 3:1) 26 | Objective Loss: 27 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 28 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 29 | weighted by α which is set to 1 by cross val. 30 | Args: 31 | c: class confidences, 32 | l: predicted boxes, 33 | g: ground truth boxes 34 | N: number of matched default boxes 35 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
36 | """ 37 | 38 | def __init__(self, num_classes, overlap_thresh, prior_for_matching, bkg_label, neg_mining, neg_pos, neg_overlap, encode_target, object_score = 0): 39 | super(RefineMultiBoxLoss, self).__init__() 40 | self.num_classes = num_classes 41 | self.threshold = overlap_thresh 42 | self.background_label = bkg_label 43 | self.encode_target = encode_target 44 | self.use_prior_for_matching = prior_for_matching 45 | self.do_neg_mining = neg_mining 46 | self.negpos_ratio = neg_pos 47 | self.neg_overlap = neg_overlap 48 | self.object_score = object_score 49 | self.variance = [0.1, 0.2] 50 | 51 | def forward(self, odm_data, priors, targets, arm_data=None, filter_object=False): 52 | """Multibox Loss 53 | Args: 54 | predictions (tuple): A tuple containing loc preds, conf preds, 55 | and prior boxes from SSD net. 56 | conf shape: torch.size(batch_size,num_priors,num_classes) 57 | loc shape: torch.size(batch_size,num_priors,4) 58 | priors shape: torch.size(num_priors,4) 59 | 60 | ground_truth (tensor): Ground truth boxes and labels for a batch, 61 | shape: [batch_size,num_objs,5] (last idx is the label). 62 | arm_data (tuple): arm branch containg arm_loc and arm_conf 63 | filter_object: whether filter out the prediction according to the arm conf score 64 | """ 65 | 66 | loc_data, conf_data = odm_data 67 | if arm_data: 68 | arm_loc, arm_conf = arm_data 69 | arm_loc = arm_loc.cpu() 70 | arm_conf = arm_conf.cpu() 71 | priors = priors.data 72 | num = loc_data.size(0) 73 | num_priors = (priors.size(0)) 74 | 75 | # match priors (default boxes) and ground truth boxes 76 | loc_t = torch.Tensor(num, num_priors, 4) 77 | conf_t = torch.LongTensor(num, num_priors) 78 | for idx in range(num): 79 | truths = targets[idx][:, :-1].data 80 | labels = targets[idx][:, -1].data 81 | # for object detection 82 | if self.num_classes == 2: 83 | labels = labels > 0 84 | if arm_data: 85 | refine_match(self.threshold, truths, priors, self.variance, labels, loc_t, conf_t, idx, arm_loc[idx].data) 86 | else: 87 | #loc_t[idx, :, :] = truths 88 | #conf_t[idx, :] = labels 89 | match(self.threshold, truths, priors.cpu(), self.variance, labels, loc_t, conf_t, idx) 90 | if GPU: 91 | loc_t = loc_t.cuda() 92 | conf_t = conf_t.cuda() 93 | 94 | # wrap targets 95 | loc_t = Variable(loc_t, requires_grad=False) 96 | conf_t = Variable(conf_t, requires_grad=False) 97 | if arm_data and filter_object: 98 | arm_conf_data = arm_conf.data[:, :, 1] 99 | pos = conf_t > 0 100 | object_score_index = arm_conf_data <= self.object_score 101 | pos[object_score_index] = 0 102 | else: 103 | pos = conf_t > 0 104 | 105 | # Localization Loss (Smooth L1) 106 | # Shape: [batch,num_priors,4] 107 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 108 | loc_p = loc_data[pos_idx].view(-1, 4) 109 | loc_t = loc_t[pos_idx].view(-1, 4) 110 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 111 | 112 | # Compute max conf across batch for hard negative mining 113 | batch_conf = conf_data.view(-1, self.num_classes) 114 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1)) 115 | 116 | # Hard Negative Mining 117 | loss_c = loss_c.view(num, -1) 118 | loss_c[pos] = 0 # filter out pos boxes for now 119 | _, loss_idx = loss_c.sort(1, descending=True) 120 | _, idx_rank = loss_idx.sort(1) 121 | num_pos = pos.long().sum(1, keepdim=True) 122 | num_neg = torch.clamp(self.negpos_ratio * num_pos, max=pos.size(1) - 1) 123 | neg = idx_rank < num_neg.expand_as(idx_rank) 124 | 125 | # Confidence Loss Including Positive and Negative Examples 
126 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 127 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 128 | conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view(-1, self.num_classes) 129 | targets_weighted = conf_t[(pos + neg).gt(0)] 130 | loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False) 131 | 132 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 133 | N = num_pos.data.sum().double() 134 | loss_l, loss_c = loss_l.double(), loss_c.double() 135 | loss_l /= N 136 | loss_c /= N 137 | return loss_l, loss_c 138 | -------------------------------------------------------------------------------- /network/RefineDet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .network_utils import L2Norm 5 | 6 | 7 | def vgg_layer_init(vgg_type='300', in_channels=3, batch_norm=True): 8 | vgg_cfg = { 9 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 10 | 512, 512, 512], 11 | '512': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 12 | 512, 512, 512], 13 | } 14 | 15 | vgg_list = [] 16 | for layer in vgg_cfg[vgg_type]: 17 | if layer == 'M': 18 | vgg_list += [nn.MaxPool2d(kernel_size=2, stride=2)] 19 | elif layer == 'C': 20 | vgg_list += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 21 | elif batch_norm: 22 | vgg_list += [nn.Conv2d(in_channels, layer, kernel_size=3, padding=1, stride=1)] 23 | vgg_list += [nn.BatchNorm2d(layer)] 24 | vgg_list += [nn.ReLU(inplace=True)] 25 | in_channels = layer 26 | else: 27 | vgg_list += [nn.Conv2d(in_channels, layer, kernel_size=3, padding=1, stride=1)] 28 | vgg_list += [nn.ReLU(inplace=True)] 29 | in_channels = layer 30 | 31 | vgg_list += [nn.MaxPool2d(kernel_size=2, stride=2, padding=0)] 32 | vgg_list += [nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)] 33 | vgg_list += [nn.ReLU(inplace=True)] 34 | vgg_list += [nn.Conv2d(1024, 1024, kernel_size=1)] 35 | vgg_list += [nn.ReLU(inplace=True)] 36 | 37 | return vgg_list 38 | 39 | 40 | class RefineDetArm(nn.Module): 41 | 42 | def __init__(self, vgg_type, in_channels, batch_norm=True): 43 | super(RefineDetArm, self).__init__() 44 | 45 | self.vgg_type = vgg_type 46 | self.in_channels = in_channels 47 | self.batch_norm = batch_norm 48 | 49 | self.vgg_list = nn.ModuleList( 50 | vgg_layer_init(in_channels=self.in_channels, batch_norm=False) 51 | ) 52 | 53 | self.extras = nn.Sequential( 54 | nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0), 55 | nn.ReLU(inplace=True), 56 | nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1), 57 | nn.ReLU(inplace=True), 58 | ) 59 | 60 | self.arm_sources = [] 61 | self.arm_confidences = [] 62 | self.arm_locations = [] 63 | self.arm_conf = None 64 | self.arm_loc = None 65 | 66 | self.arm_location_layers = nn.ModuleList( 67 | [ 68 | nn.Conv2d(512, 12, kernel_size=3, stride=1, padding=1), 69 | nn.Conv2d(512, 12, kernel_size=3, stride=1, padding=1), 70 | nn.Conv2d(1024, 12, kernel_size=3, stride=1, padding=1), 71 | nn.Conv2d(512, 12, kernel_size=3, stride=1, padding=1), 72 | ] 73 | ) 74 | self.arm_confidence_layers = nn.ModuleList( 75 | [ 76 | nn.Conv2d(512, 6, kernel_size=3, stride=1, padding=1), 77 | nn.Conv2d(512, 6, kernel_size=3, stride=1, padding=1), 78 | nn.Conv2d(1024, 6, kernel_size=3, stride=1, padding=1), 79 | nn.Conv2d(512, 6, kernel_size=3, stride=1, padding=1), 80 | ] 81 | ) 82 | 83 | def forward(self, x): 84 | self.arm_sources.clear() 85 | 
self.arm_locations.clear() 86 | self.arm_confidences.clear() 87 | 88 | for layer in self.vgg_list[: 23]: 89 | x = layer(x) 90 | self.arm_sources.append(x) 91 | 92 | for layer in self.vgg_list[23: 30]: 93 | x = layer(x) 94 | self.arm_sources.append(x) 95 | 96 | for layer in self.vgg_list[30:]: 97 | x = layer(x) 98 | self.arm_sources.append(x) 99 | 100 | x = self.extras(x) 101 | self.arm_sources.append(x) 102 | 103 | for arm_source, arm_confidence_layer, arm_location_layer in zip(self.arm_sources, self.arm_confidence_layers, 104 | self.arm_location_layers): 105 | self.arm_confidences.append(arm_confidence_layer(arm_source).permute(0, 2, 3, 1).contiguous()) 106 | self.arm_locations.append(arm_location_layer(arm_source).permute(0, 2, 3, 1).contiguous()) 107 | # why use transpose and reshape? 108 | arm_conf = torch.cat([item.view(item.size(0), -1) for item in self.arm_confidences], 1) 109 | arm_loc = torch.cat([item.view(item.size(0), -1) for item in self.arm_locations], 1) 110 | 111 | return x, self.arm_sources, arm_conf, arm_loc 112 | 113 | 114 | class RefineDetObm(nn.Module): 115 | 116 | def __init__(self, num_classes, batch_norm=True): 117 | super(RefineDetObm, self).__init__() 118 | 119 | self.last_layer_trans = nn.Sequential( 120 | nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1), 121 | nn.ReLU(inplace=True), 122 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 123 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 124 | ) 125 | 126 | self.L2Norm_4_3 = L2Norm(512, 10) 127 | self.L2Norm_5_3 = L2Norm(512, 8) 128 | 129 | self.obm_sources = [] 130 | self.obm_confidences = [] 131 | self.obm_locations = [] 132 | self.transfer_list = [] 133 | 134 | self.arm_sources = None 135 | self.arm_conf = None 136 | self.arm_loc = None 137 | self.obm_conf = None 138 | self.obm_loc = None 139 | self.trans_result = None 140 | 141 | self.num_classes = num_classes 142 | 143 | self.obm_location_layers = nn.ModuleList( 144 | [ 145 | nn.Conv2d(256, 12, kernel_size=3, stride=1, padding=1), 146 | nn.Conv2d(256, 12, kernel_size=3, stride=1, padding=1), 147 | nn.Conv2d(256, 12, kernel_size=3, stride=1, padding=1), 148 | nn.Conv2d(256, 12, kernel_size=3, stride=1, padding=1), 149 | ] 150 | ) 151 | self.obm_confidence_layers = nn.ModuleList( 152 | [ 153 | nn.Conv2d(256, 3 * self.num_classes, kernel_size=3, stride=1, padding=1), 154 | nn.Conv2d(256, 3 * self.num_classes, kernel_size=3, stride=1, padding=1), 155 | nn.Conv2d(256, 3 * self.num_classes, kernel_size=3, stride=1, padding=1), 156 | nn.Conv2d(256, 3 * self.num_classes, kernel_size=3, stride=1, padding=1), 157 | ] 158 | ) 159 | self.transfer_layers = nn.ModuleList( 160 | [ 161 | nn.Sequential( 162 | nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1), 163 | nn.ReLU(inplace=True), 164 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 165 | ), 166 | nn.Sequential( 167 | nn.Conv2d(512, 256, kernel_size=3, stride=1, padding=1), 168 | nn.ReLU(inplace=True), 169 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 170 | ), 171 | nn.Sequential( 172 | nn.Conv2d(1024, 256, kernel_size=3, stride=1, padding=1), 173 | nn.ReLU(inplace=True), 174 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 175 | ), 176 | ] 177 | ) 178 | self.upconv_layers = nn.ModuleList( 179 | [ 180 | nn.ConvTranspose2d(256, 256, kernel_size=2, stride=2, padding=0), 181 | nn.ConvTranspose2d(256, 256, kernel_size=2, stride=2, padding=0), 182 | nn.ConvTranspose2d(256, 256, kernel_size=2, stride=2, padding=0), 183 | ] 184 | ) 185 | self.latent_layers = 
nn.ModuleList( 186 | [ 187 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 188 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 189 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1), 190 | ] 191 | ) 192 | self.softmax = nn.Softmax() 193 | 194 | # if "is_propagate" is set to True, then "previous_layer" is needed. 195 | # currently simply use previous feature maps 196 | def forward(self, x, arm_sources, arm_conf, arm_loc, is_training=True): 197 | 198 | self.obm_sources.clear() 199 | self.obm_confidences.clear() 200 | self.obm_locations.clear() 201 | self.transfer_list.clear() 202 | 203 | self.arm_sources = arm_sources 204 | self.arm_conf = arm_conf 205 | self.arm_loc = arm_loc 206 | 207 | x = self.last_layer_trans(x) 208 | self.obm_sources.append(x) 209 | 210 | for arm_source, transfer_layer in zip(self.arm_sources, self.transfer_layers): 211 | self.transfer_list.append(transfer_layer(arm_source)) 212 | 213 | self.transfer_list.reverse() 214 | self.arm_sources.reverse() 215 | 216 | for transfer_item, upconv_item, latent_item in zip(self.transfer_list, self.upconv_layers, self.latent_layers): 217 | x = F.relu(latent_item(F.relu(upconv_item(x) + transfer_item, inplace=True)), inplace=True) 218 | self.obm_sources.append(x) 219 | 220 | for obm_source, obm_confidence_layer, obm_location_layer in zip(self.obm_sources, self.obm_confidence_layers, self.obm_location_layers): 221 | self.obm_confidences.append(obm_confidence_layer(obm_source).permute(0, 2, 3, 1).contiguous()) 222 | self.obm_locations.append(obm_location_layer(obm_source).permute(0, 2, 3, 1).contiguous()) 223 | self.obm_conf = torch.cat([item.view(item.size(0), -1) for item in self.obm_confidences], 1) 224 | self.obm_loc = torch.cat([item.view(item.size(0), -1) for item in self.obm_locations], 1) 225 | 226 | output = ( 227 | self.arm_sources[-1], 228 | self.softmax(self.arm_conf.view(self.arm_conf.size(0), -1, 2)), 229 | self.arm_loc.view(self.arm_loc.size(0), -1, 4), 230 | self.softmax(self.obm_conf.view(self.obm_conf.size(0), -1, self.num_classes)), 231 | self.obm_loc.view(self.obm_loc.size(0), -1, 4), 232 | #None, # updating mask 233 | ) 234 | 235 | return output 236 | 237 | -------------------------------------------------------------------------------- /network/__pycache__/FlowNet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/network/__pycache__/FlowNet.cpython-36.pyc -------------------------------------------------------------------------------- /network/__pycache__/MultiBoxLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/network/__pycache__/MultiBoxLoss.cpython-36.pyc -------------------------------------------------------------------------------- /network/__pycache__/RefineDet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/network/__pycache__/RefineDet.cpython-36.pyc -------------------------------------------------------------------------------- /network/__pycache__/box_utils.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/network/__pycache__/box_utils.cpython-36.pyc -------------------------------------------------------------------------------- /network/__pycache__/network_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/network/__pycache__/network_utils.cpython-36.pyc -------------------------------------------------------------------------------- /network/__pycache__/prior_box.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zzzkkkyyy/Video-Object-Detection/bc57ece447620be4b2980daa82b187a391a893a1/network/__pycache__/prior_box.cpython-36.pyc -------------------------------------------------------------------------------- /network/box_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | import numpy as np 5 | if torch.cuda.is_available(): 6 | import torch.backends.cudnn as cudnn 7 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 8 | 9 | 10 | def point_form(boxes): 11 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 12 | representation for comparison to point form ground truth data. 13 | Args: 14 | boxes: (tensor) center-size default boxes from priorbox layers. 15 | Return: 16 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 17 | """ 18 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 19 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 20 | 21 | 22 | def center_size(boxes): 23 | """ Convert prior_boxes to (cx, cy, w, h) 24 | representation for comparison to center-size form ground truth data. 25 | Args: 26 | boxes: (tensor) point_form boxes 27 | Return: 28 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 29 | """ 30 | return torch.cat([(boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 31 | boxes[:, 2:] - boxes[:, :2]], 1) # w, h 32 | 33 | 34 | def intersect(box_a, box_b): 35 | """ We resize both tensors to [A,B,2] without new malloc: 36 | [A,2] -> [A,1,2] -> [A,B,2] 37 | [B,2] -> [1,B,2] -> [A,B,2] 38 | Then we compute the area of intersect between box_a and box_b. 39 | Args: 40 | box_a: (tensor) bounding boxes, Shape: [A,4]. 41 | box_b: (tensor) bounding boxes, Shape: [B,4]. 42 | Return: 43 | (tensor) intersection area, Shape: [A,B]. 44 | """ 45 | A = box_a.size(0) 46 | B = box_b.size(0) 47 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), 48 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) 49 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), 50 | box_b[:, :2].unsqueeze(0).expand(A, B, 2)) 51 | inter = torch.clamp((max_xy - min_xy), min=0) 52 | return inter[:, :, 0] * inter[:, :, 1] 53 | 54 | 55 | def jaccard(box_a, box_b): 56 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 57 | is simply the intersection over union of two boxes. Here we operate on 58 | ground truth boxes and default boxes. 
59 | E.g.: 60 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 61 | Args: 62 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] 63 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] 64 | Return: 65 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] 66 | """ 67 | inter = intersect(box_a, box_b) 68 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 69 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] 70 | area_b = ((box_b[:, 2]-box_b[:, 0]) * 71 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] 72 | union = area_a + area_b - inter 73 | return inter / union # [A,B] 74 | 75 | def matrix_iou(a,b): 76 | """ 77 | return iou of a and b, numpy version for data augenmentation 78 | """ 79 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) 80 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) 81 | 82 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) 83 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) 84 | area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) 85 | return area_i / (area_a[:, np.newaxis] + area_b - area_i) 86 | 87 | 88 | def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx): 89 | """Match each prior box with the ground truth box of the highest jaccard 90 | overlap, encode the bounding boxes, then return the matched indices 91 | corresponding to both confidence and location preds. 92 | Args: 93 | threshold: (float) The overlap threshold used when mathing boxes. 94 | truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors]. 95 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 96 | variances: (tensor) Variances corresponding to each prior coord, 97 | Shape: [num_priors, 4]. 98 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 99 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 100 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 101 | idx: (int) current batch index 102 | Return: 103 | The matched indices corresponding to 1)location and 2)confidence preds. 
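        Example (shapes only, for illustration): with 2 ground-truth boxes and
        4 priors, `overlaps` is a [2, 4] jaccard matrix; `best_prior_idx` ([2])
        gives each ground truth its best prior, `best_truth_idx` ([4]) gives
        each prior its best ground truth, and any prior whose best overlap falls
        below `threshold` is labelled 0 (background) in conf_t.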
104 | """ 105 | # jaccard index 106 | overlaps = jaccard( 107 | truths, 108 | point_form(priors) 109 | ) 110 | # (Bipartite Matching) 111 | # [1,num_objects] best prior for each ground truth 112 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 113 | # [1,num_priors] best ground truth for each prior 114 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 115 | best_truth_idx.squeeze_(0) 116 | best_truth_overlap.squeeze_(0) 117 | best_prior_idx.squeeze_(1) 118 | best_prior_overlap.squeeze_(1) 119 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior 120 | # TODO refactor: index best_prior_idx with long tensor 121 | # ensure every gt matches with its prior of max overlap 122 | for j in range(best_prior_idx.size(0)): 123 | best_truth_idx[best_prior_idx[j]] = j 124 | matches = truths[best_truth_idx] # Shape: [num_priors,4] 125 | conf = labels[best_truth_idx] # Shape: [num_priors] 126 | conf[best_truth_overlap < threshold] = 0 # label as background 127 | loc = encode(matches, priors, variances) 128 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 129 | conf_t[idx] = conf # [num_priors] top class label for each prior 130 | 131 | def refine_match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx, arm_loc): 132 | """Match each arm bbox with the ground truth box of the highest jaccard 133 | overlap, encode the bounding boxes, then return the matched indices 134 | corresponding to both confidence and location preds. 135 | Args: 136 | threshold: (float) The overlap threshold used when mathing boxes. 137 | truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors]. 138 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 139 | variances: (tensor) Variances corresponding to each prior coord, 140 | Shape: [num_priors, 4]. 141 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 142 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 143 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 144 | idx: (int) current batch index 145 | arm_loc: (tensor) arm loc data,shape: [n_priors,4] 146 | Return: 147 | The matched indices corresponding to 1)location and 2)confidence preds. 
148 | """ 149 | # decode arm box 150 | decode_arm = decode(arm_loc, priors=priors, variances=variances) 151 | # jaccard index 152 | overlaps = jaccard( 153 | truths, 154 | decode_arm 155 | ) 156 | # (Bipartite Matching) 157 | # [1,num_objects] best prior for each ground truth 158 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 159 | # [1,num_priors] best ground truth for each prior 160 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 161 | best_truth_idx.squeeze_(0) 162 | best_truth_overlap.squeeze_(0) 163 | best_prior_idx.squeeze_(1) 164 | best_prior_overlap.squeeze_(1) 165 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior 166 | # TODO refactor: index best_prior_idx with long tensor 167 | # ensure every gt matches with its prior of max overlap 168 | for j in range(best_prior_idx.size(0)): 169 | best_truth_idx[best_prior_idx[j]] = j 170 | matches = truths[best_truth_idx] # Shape: [num_priors,4] 171 | conf = labels[best_truth_idx] # Shape: [num_priors] 172 | conf[best_truth_overlap < threshold] = 0 # label as background 173 | loc = encode(matches, center_size(decode_arm), variances) 174 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 175 | conf_t[idx] = conf # [num_priors] top class label for each prior 176 | 177 | def encode(matched, priors, variances): 178 | """Encode the variances from the priorbox layers into the ground truth boxes 179 | we have matched (based on jaccard overlap) with the prior boxes. 180 | Args: 181 | matched: (tensor) Coords of ground truth for each prior in point-form 182 | Shape: [num_priors, 4]. 183 | priors: (tensor) Prior boxes in center-offset form 184 | Shape: [num_priors,4]. 185 | variances: (list[float]) Variances of priorboxes 186 | Return: 187 | encoded boxes (tensor), Shape: [num_priors, 4] 188 | """ 189 | 190 | # dist b/t match center and prior's center 191 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] 192 | # encode variance 193 | g_cxcy /= (variances[0] * priors[:, 2:]) 194 | # match wh / prior wh 195 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 196 | g_wh = torch.log(g_wh) / variances[1] 197 | # return target for smooth_l1_loss 198 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 199 | 200 | 201 | def encode_multi(matched, priors, offsets, variances): 202 | """Encode the variances from the priorbox layers into the ground truth boxes 203 | we have matched (based on jaccard overlap) with the prior boxes. 204 | Args: 205 | matched: (tensor) Coords of ground truth for each prior in point-form 206 | Shape: [num_priors, 4]. 207 | priors: (tensor) Prior boxes in center-offset form 208 | Shape: [num_priors,4]. 
209 | variances: (list[float]) Variances of priorboxes 210 | Return: 211 | encoded boxes (tensor), Shape: [num_priors, 4] 212 | """ 213 | 214 | # dist b/t match center and prior's center 215 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] - offsets[:,:2] 216 | # encode variance 217 | #g_cxcy /= (variances[0] * priors[:, 2:]) 218 | g_cxcy.div_(variances[0] * offsets[:, 2:]) 219 | # match wh / prior wh 220 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 221 | g_wh = torch.log(g_wh) / variances[1] 222 | # return target for smooth_l1_loss 223 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 224 | 225 | # Adapted from https://github.com/Hakuyume/chainer-ssd 226 | def decode(loc, priors, variances): 227 | """Decode locations from predictions using priors to undo 228 | the encoding we did for offset regression at train time. 229 | Args: 230 | loc (tensor): location predictions for loc layers, 231 | Shape: [num_priors,4] 232 | priors (tensor): Prior boxes in center-offset form. 233 | Shape: [num_priors,4]. 234 | variances: (list[float]) Variances of priorboxes 235 | Return: 236 | decoded bounding box predictions 237 | """ 238 | 239 | boxes = torch.cat(( 240 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 241 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 242 | boxes[:, :2] -= boxes[:, 2:] / 2 243 | boxes[:, 2:] += boxes[:, :2] 244 | return boxes 245 | 246 | def decode_multi(loc, priors, offsets, variances): 247 | """Decode locations from predictions using priors to undo 248 | the encoding we did for offset regression at train time. 249 | Args: 250 | loc (tensor): location predictions for loc layers, 251 | Shape: [num_priors,4] 252 | priors (tensor): Prior boxes in center-offset form. 253 | Shape: [num_priors,4]. 254 | variances: (list[float]) Variances of priorboxes 255 | Return: 256 | decoded bounding box predictions 257 | """ 258 | 259 | boxes = torch.cat(( 260 | priors[:, :2] + offsets[:,:2]+ loc[:, :2] * variances[0] * offsets[:, 2:], 261 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 262 | boxes[:, :2] -= boxes[:, 2:] / 2 263 | boxes[:, 2:] += boxes[:, :2] 264 | return boxes 265 | 266 | def log_sum_exp(x): 267 | """Utility function for computing log_sum_exp while determining 268 | This will be used to determine unaveraged confidence loss across 269 | all examples in a batch. 270 | Args: 271 | x (Variable(tensor)): conf_preds from conf layers 272 | """ 273 | x_max = x.data.max() 274 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 275 | 276 | 277 | # Original author: Francisco Massa: 278 | # https://github.com/fmassa/object-detection.torch 279 | # Ported to PyTorch by Max deGroot (02/01/2017) 280 | def nms(boxes, scores, overlap=0.5, top_k=200): 281 | """Apply non-maximum suppression at test time to avoid detecting too many 282 | overlapping bounding boxes for a given object. 283 | Args: 284 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 285 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 286 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 287 | top_k: (int) The Maximum number of box preds to consider. 288 | Return: 289 | The indices of the kept boxes with respect to num_priors. 
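        Example (sketch; `decoded_boxes` and `class_scores` are placeholder
        names): for one image and one class,
            ids, count = nms(decoded_boxes, class_scores, overlap=0.45, top_k=200)
            kept_boxes = decoded_boxes[ids[:count]]
        Only the first `count` entries of the returned index tensor are valid;
        the remaining slots keep their initial value of 0.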
290 | """ 291 | 292 | keep = torch.Tensor(scores.size(0)).fill_(0).long() 293 | if boxes.numel() == 0: 294 | return keep 295 | x1 = boxes[:, 0] 296 | y1 = boxes[:, 1] 297 | x2 = boxes[:, 2] 298 | y2 = boxes[:, 3] 299 | area = torch.mul(x2 - x1, y2 - y1) 300 | v, idx = scores.sort(0) # sort in ascending order 301 | # I = I[v >= 0.01] 302 | idx = idx[-top_k:] # indices of the top-k largest vals 303 | xx1 = boxes.new() 304 | yy1 = boxes.new() 305 | xx2 = boxes.new() 306 | yy2 = boxes.new() 307 | w = boxes.new() 308 | h = boxes.new() 309 | 310 | # keep = torch.Tensor() 311 | count = 0 312 | while idx.numel() > 0: 313 | i = idx[-1] # index of current largest val 314 | # keep.append(i) 315 | keep[count] = i 316 | count += 1 317 | if idx.size(0) == 1: 318 | break 319 | idx = idx[:-1] # remove kept element from view 320 | # load bboxes of next highest vals 321 | torch.index_select(x1, 0, idx, out=xx1) 322 | torch.index_select(y1, 0, idx, out=yy1) 323 | torch.index_select(x2, 0, idx, out=xx2) 324 | torch.index_select(y2, 0, idx, out=yy2) 325 | # store element-wise max with next highest score 326 | xx1 = torch.clamp(xx1, min=x1[i]) 327 | yy1 = torch.clamp(yy1, min=y1[i]) 328 | xx2 = torch.clamp(xx2, max=x2[i]) 329 | yy2 = torch.clamp(yy2, max=y2[i]) 330 | w.resize_as_(xx2) 331 | h.resize_as_(yy2) 332 | w = xx2 - xx1 333 | h = yy2 - yy1 334 | # check sizes of xx1 and xx2.. after each iteration 335 | w = torch.clamp(w, min=0.0) 336 | h = torch.clamp(h, min=0.0) 337 | inter = w*h 338 | # IoU = i / (area(a) + area(b) - i) 339 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 340 | union = (rem_areas - inter) + area[i] 341 | IoU = inter/union # store result in iou 342 | # keep only elements with an IoU <= overlap 343 | idx = idx[IoU.le(overlap)] 344 | return keep, count 345 | -------------------------------------------------------------------------------- /network/network_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Function 4 | from torch.autograd import Variable 5 | import torch.nn.init as init 6 | 7 | class L2Norm(nn.Module): 8 | def __init__(self, n_channels, gamma=None): 9 | super(L2Norm, self).__init__() 10 | self.n_channels = n_channels 11 | self.gamma = gamma 12 | self.eps = 1e-10 13 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 14 | self.reset_parameters() 15 | 16 | def reset_parameters(self): 17 | init.constant(self.weight, self.gamma) 18 | 19 | def forward(self, x): 20 | norm = x.pow(x).sum(dim=1, keepdim=True).sqrt() + self.eps 21 | x /= norm 22 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 23 | return out 24 | 25 | class EmbeddingNetwork(nn.Module): 26 | 27 | def __init__(self, in_channels): 28 | super(EmbeddingNetwork, self).__init__() 29 | self.conv1 = nn.Conv2d(in_channels, 512, kernel_size=1, stride=1, padding=0) 30 | self.conv2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1) 31 | self.conv3 = nn.Conv2d(512, 2048, kernel_size=1, stride=1, padding=0) 32 | self.global_avg_pool = nn.AdaptiveAvgPool2d(1) 33 | 34 | def forward(self, x): 35 | x = self.conv1(x) 36 | x = self.conv2(x) 37 | x = self.conv3(x) 38 | x = self.global_avg_pool(x) 39 | x = x.squeeze(-1).squeeze(-1) 40 | 41 | return x -------------------------------------------------------------------------------- /network/prior_box.py: -------------------------------------------------------------------------------- 1 | from itertools import 
product as product 2 | from math import sqrt as sqrt 3 | 4 | import torch 5 | import numpy as np 6 | 7 | if torch.cuda.is_available(): 8 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 9 | 10 | 11 | class PriorBox(object): 12 | """Compute priorbox coordinates in center-offset form for each source 13 | feature map. 14 | Note: 15 | This 'layer' has changed between versions of the original SSD 16 | paper, so we include both versions, but note v2 is the most tested and most 17 | recent version of the paper. 18 | 19 | """ 20 | 21 | def __init__(self, cfg): 22 | super(PriorBox, self).__init__() 23 | self.image_size = cfg['min_dim'] 24 | # number of priors for feature map location (either 4 or 6) 25 | self.num_priors = len(cfg['aspect_ratios']) 26 | self.variance = cfg['variance'] or [0.1] 27 | self.feature_maps = cfg['feature_maps'] 28 | self.min_sizes = cfg['min_sizes'] 29 | self.max_sizes = cfg['max_sizes'] 30 | self.steps = cfg['steps'] 31 | self.aspect_ratios = cfg['aspect_ratios'] 32 | self.clip = cfg['clip'] 33 | for v in self.variance: 34 | if v <= 0: 35 | raise ValueError('Variances must be greater than 0') 36 | 37 | def forward(self): 38 | mean = [] 39 | for k, f in enumerate(self.feature_maps): 40 | for i, j in product(range(f), repeat=2): 41 | f_k = self.image_size / self.steps[k] 42 | cx = (j + 0.5) / f_k 43 | cy = (i + 0.5) / f_k 44 | 45 | s_k = self.min_sizes[k] / self.image_size 46 | mean += [cx, cy, s_k, s_k] 47 | 48 | # aspect_ratio: 1 49 | # rel size: sqrt(s_k * s_(k+1)) 50 | if self.max_sizes: 51 | s_k_prime = sqrt(s_k * (self.max_sizes[k] / self.image_size)) 52 | mean += [cx, cy, s_k_prime, s_k_prime] 53 | 54 | # rest of aspect ratios 55 | for ar in self.aspect_ratios[k]: 56 | mean += [cx, cy, s_k * sqrt(ar), s_k / sqrt(ar)] 57 | mean += [cx, cy, s_k / sqrt(ar), s_k * sqrt(ar)] 58 | 59 | # back to torch land 60 | output = np.array(mean).reshape(-1, 4) 61 | #output = torch.Tensor(mean).view(-1, 4) 62 | if self.clip: 63 | output = np.clip(output, 0., 1.) 64 | #output.clamp_(max=1, min=0) 65 | return output 66 | --------------------------------------------------------------------------------
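A minimal sketch of how PriorBox might be configured and called for a 320x320 RefineDet-style model with four source layers; the values below are illustrative assumptions, not settings taken from this repository:

    # Hypothetical configuration -- the real values used by this project may differ.
    cfg = {
        'min_dim': 320,                         # input image size
        'feature_maps': [40, 20, 10, 5],        # spatial size of each source layer
        'steps': [8, 16, 32, 64],               # min_dim / feature map size
        'min_sizes': [32, 64, 128, 256],        # anchor scale per layer
        'max_sizes': [],                        # empty list -> skip the sqrt(s_k * s_k+1) prior
        'aspect_ratios': [[2], [2], [2], [2]],  # plus the implicit ratio 1, i.e. 3 priors per cell
        'variance': [0.1, 0.2],
        'clip': True,
    }
    priors = PriorBox(cfg).forward()            # numpy array, shape [num_priors, 4], (cx, cy, w, h)

Three priors per location is consistent with the 12-channel (3 * 4) location heads and the 3 * num_classes confidence heads defined in RefineDet.py.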