├── rock
│   ├── __init__.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── backbone.py
│   │   ├── network.py
│   │   ├── detection.py
│   │   ├── auxiliary.py
│   │   └── losses.py
│   ├── ssd
│   │   ├── __init__.py
│   │   ├── prior_boxes.py
│   │   └── encoder.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── hide_print.py
│   │   ├── tensorboard_model.py
│   │   ├── load.py
│   │   ├── show.py
│   │   └── draw.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── image_folder.py
│   │   ├── nyu_depth_v2.py
│   │   └── transforms.py
│   ├── training.py
│   ├── detect.py
│   ├── eval.py
│   ├── trainer.py
│   ├── prep.py
│   └── run.py
├── docs
│   ├── ssd_0332.png
│   ├── ssd_0992.png
│   ├── ssd_1302.png
│   ├── rock_0170.png
│   ├── rock_0332.png
│   ├── rock_0992.png
│   ├── rock_1150.png
│   ├── rock_1302.png
│   ├── rock_schema.png
│   └── tensorboard_training_detections.png
├── requirements.txt
├── .gitignore
├── LICENSE
└── README.md
/rock/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rock/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rock/ssd/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rock/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/rock/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/ssd_0332.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vita-epfl/rock-pytorch/HEAD/docs/ssd_0332.png
--------------------------------------------------------------------------------
/docs/ssd_0992.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vita-epfl/rock-pytorch/HEAD/docs/ssd_0992.png
--------------------------------------------------------------------------------
/docs/ssd_1302.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vita-epfl/rock-pytorch/HEAD/docs/ssd_1302.png
--------------------------------------------------------------------------------
/docs/rock_0170.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vita-epfl/rock-pytorch/HEAD/docs/rock_0170.png
--------------------------------------------------------------------------------
/docs/rock_0332.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vita-epfl/rock-pytorch/HEAD/docs/rock_0332.png
--------------------------------------------------------------------------------
/docs/rock_0992.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vita-epfl/rock-pytorch/HEAD/docs/rock_0992.png
--------------------------------------------------------------------------------
/docs/rock_1150.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vita-epfl/rock-pytorch/HEAD/docs/rock_1150.png
-------------------------------------------------------------------------------- /docs/rock_1302.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vita-epfl/rock-pytorch/HEAD/docs/rock_1302.png -------------------------------------------------------------------------------- /docs/rock_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vita-epfl/rock-pytorch/HEAD/docs/rock_schema.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | matplotlib 4 | Pillow 5 | torch 6 | torchvision 7 | h5py 8 | tensorboard 9 | pycocotools -------------------------------------------------------------------------------- /docs/tensorboard_training_detections.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vita-epfl/rock-pytorch/HEAD/docs/tensorboard_training_detections.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Data files in all folders 2 | data/ 3 | # Editors 4 | .vscode/ 5 | .idea/ 6 | .ipynb_checkpoints/ 7 | 8 | # MacOS 9 | .DS_Store 10 | 11 | # Byte-compiled / optimized / DLL files 12 | _pycache_/ 13 | __pycache__/ 14 | -------------------------------------------------------------------------------- /rock/utils/hide_print.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | class HiddenPrints(object): 6 | """ Hides print called inside this class 7 | Used to suppress COCOeval printing when not verbose 8 | 9 | From: https://stackoverflow.com/questions/8391411/suppress-calls-to-print-python 10 | """ 11 | def __enter__(self): 12 | self._original_stdout = sys.stdout 13 | sys.stdout = open(os.devnull, 'w') 14 | 15 | def __exit__(self, exc_type, exc_val, exc_tb): 16 | sys.stdout.close() 17 | sys.stdout = self._original_stdout 18 | -------------------------------------------------------------------------------- /rock/utils/tensorboard_model.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data 2 | from torch.utils.tensorboard import SummaryWriter 3 | 4 | import rock.ssd.prior_boxes 5 | import rock.ssd.encoder 6 | import rock.datasets.image_folder 7 | import rock.model.network 8 | 9 | 10 | def add_graph_to_tb(writer_path: str = 'data/runs/rock_model'): 11 | 12 | print("Adding graph to Tensorboard") 13 | model = rock.model.network.rock_network() 14 | dataset = rock.datasets.image_folder.ImageFolder('data/detection/images') 15 | dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=2) 16 | images, _ = next(iter(dataloader)) 17 | 18 | model = model.cuda() 19 | images = images.cuda() 20 | 21 | writer = SummaryWriter(writer_path) 22 | writer.add_graph(model, images) 23 | writer.close() 24 | print("Done adding graph!") 25 | -------------------------------------------------------------------------------- /rock/utils/load.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional, Any, Tuple 3 | 4 | import torch 5 | import torch.nn 6 | 7 | 8 | def 
load_from_checkpoint(checkpoint_path: str, 9 | model: torch.nn.Module, 10 | optimizer: Optional[Any] = None, # from torch.optim 11 | scheduler: Optional[Any] = None, # from torch.optim 12 | verbose: bool = True) -> Tuple[int, int]: 13 | """Loads model from checkpoint, loads optimizer and scheduler too if not None, and returns epoch and iteration 14 | of the checkpoint 15 | """ 16 | if not os.path.exists(checkpoint_path): 17 | raise FileNotFoundError("File doesn't exist {}".format(checkpoint_path)) 18 | 19 | checkpoint = torch.load(checkpoint_path) 20 | 21 | model.load_state_dict(checkpoint['model']) 22 | 23 | if optimizer: 24 | optimizer.load_state_dict(checkpoint['optimizer']) 25 | 26 | if scheduler: 27 | scheduler.load_state_dict(checkpoint['scheduler']) 28 | 29 | epoch = checkpoint['epoch'] 30 | iteration = checkpoint['iteration'] 31 | 32 | if verbose: 33 | print("Loaded model from checkpoint") 34 | 35 | return epoch, iteration 36 | -------------------------------------------------------------------------------- /rock/model/backbone.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | from torch import nn 4 | from torchvision.models import ResNet 5 | # noinspection PyProtectedMember 6 | from torchvision.models.resnet import Bottleneck 7 | 8 | 9 | class ResNet50(ResNet): 10 | """Custom ResNet50 used as the network backbone 11 | 12 | Modified version of the base ResNet50 from torchvision, 13 | found at: https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py 14 | 15 | This ResNet replaces the stride with a dilation on layer 4 for a larger feature map, and 16 | removes the average pooling and fully-connected part of the network. 17 | During training, BatchNorm is frozen. 18 | """ 19 | 20 | def __init__(self) -> None: 21 | # Replace stride with dilation on layer 4 22 | super().__init__(Bottleneck, [3, 4, 6, 3], replace_stride_with_dilation=[False, False, True]) 23 | self.training = True 24 | 25 | # Pretrain the network 26 | self.load_state_dict(torchvision.models.resnet50(pretrained=True).state_dict()) 27 | 28 | # Replace last 2 (unused) layers with identity 29 | self.avgpool = nn.Identity() 30 | self.fc = nn.Identity() 31 | 32 | def _forward_impl(self, x: torch.Tensor) -> torch.Tensor: 33 | """ Forward implementation of ResNet50 where avg_pool, flatten and fc are removed 34 | 35 | Shape: 36 | - X: :math:`(N, 3, H_in, W_in)` where :math:`(H_in, W_in)` are the image height and width 37 | - Output: :math:`(N, C_out, H_out, W_out)` where :math:`C_out` is the number of output channels 38 | | 39 | For ROCK implementation on the NYUv2 dataset: 40 | - :math:`(H_in, W_in) = (480, 640)` 41 | - :math:`(C_out, H_out, W_out) = (2048, 30, 40)` 42 | """ 43 | x = self.conv1(x) 44 | x = self.bn1(x) 45 | x = self.relu(x) 46 | x = self.maxpool(x) 47 | 48 | x = self.layer1(x) 49 | x = self.layer2(x) 50 | x = self.layer3(x) 51 | x = self.layer4(x) 52 | return x 53 | 54 | def freeze_bn(self) -> None: 55 | """Freezes batch norm layers 56 | """ 57 | 58 | for module in self.modules(): 59 | if type(module) == nn.modules.batchnorm.BatchNorm2d: 60 | module.eval() 61 | 62 | def train(self, mode=True): 63 | r"""Override of the nn.Module train method to keep batchnorm frozen in all cases 64 | 65 | Sets the module in training mode. 66 | 67 | This has any effect only on certain modules. See documentations of 68 | particular modules for details of their behaviors in training/evaluation 69 | mode, if they are affected, e.g.
:class:`Dropout`, :class:`BatchNorm`, 70 | etc. 71 | 72 | Args: 73 | mode (bool): whether to set training mode (``True``) or evaluation 74 | mode (``False``). Default: ``True``. 75 | 76 | Returns: 77 | Module: self 78 | """ 79 | self.training = mode 80 | for module in self.children(): 81 | module.train(mode) 82 | 83 | self.freeze_bn() 84 | 85 | return self 86 | 87 | -------------------------------------------------------------------------------- /rock/datasets/image_folder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from pathlib import Path 4 | from typing import Tuple, Any 5 | 6 | import numpy as np 7 | import torch 8 | from PIL import Image 9 | from torch.utils import data as data 10 | from torchvision import transforms 11 | 12 | 13 | def extract_image_and_save_to_folder(data_folder_path: str, 14 | save_folder_path: str, 15 | verbose: bool = True) -> None: 16 | """ Extracts images from a folder of .pkl files and saves it to another folder 17 | """ 18 | def pickle_load(fp: str) -> Any: 19 | """ Loads pickled data from a given path 20 | """ 21 | with open(fp, 'rb') as handle: 22 | loaded_data = pickle.load(handle) 23 | return loaded_data 24 | 25 | Path(save_folder_path).mkdir(parents=True, exist_ok=True) 26 | 27 | if verbose: 28 | print("Extracting images...") 29 | 30 | files = sorted([os.path.join(data_folder_path, file) for file in os.listdir(data_folder_path) if file.endswith(".pkl")]) 31 | 32 | image_count = 0 33 | total_images = len(files) 34 | 35 | for filepath in files: 36 | filename = os.path.basename(filepath).replace('.pkl', '.png') 37 | save_path = os.path.join(save_folder_path, filename) 38 | 39 | d = pickle_load(filepath) 40 | img = Image.fromarray(d['img']) 41 | img.save(fp=save_path) 42 | 43 | image_count += 1 44 | if verbose: 45 | print("{}/{} images extracted".format(image_count, total_images), end='\r') 46 | 47 | if verbose: 48 | print() 49 | print("Done!") 50 | 51 | 52 | class ImageFolder(data.Dataset): 53 | """Datareader for a folder of images 54 | """ 55 | 56 | def __init__(self, path: str) -> None: 57 | """ 58 | Args: 59 | path: path to the folder containing images (either PNG or JPG images) 60 | """ 61 | self.files = sorted([os.path.join(path, file) for file in os.listdir(path) if file.endswith(".jpg") or file.endswith(".png")]) 62 | 63 | self.size = (480, 640) 64 | self.aspect_ratio = self.size[1] / self.size[0] 65 | 66 | self.img_trans = transforms.Compose([ 67 | transforms.Resize(self.size), 68 | transforms.ToTensor(), 69 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 70 | ]) 71 | 72 | def __len__(self) -> int: 73 | return len(self.files) 74 | 75 | def __getitem__(self, idx: int) -> Tuple[torch.Tensor, str]: 76 | """ Obtains the image and filename at a given index 77 | """ 78 | # Get file name 79 | filename = os.path.basename(self.files[idx]) 80 | 81 | # Load image and remove alpha color channel if existing 82 | img = Image.open(self.files[idx]).convert("RGB") 83 | 84 | # Find largest crop 85 | if self.aspect_ratio < 1: 86 | crop_w = np.min([img.height * self.aspect_ratio, img.width]) 87 | crop_h = crop_w / self.aspect_ratio 88 | else: 89 | crop_h = np.min([img.width / self.aspect_ratio, img.height]) 90 | crop_w = crop_h * self.aspect_ratio 91 | 92 | img = transforms.CenterCrop(size=(crop_h, crop_w))(img) 93 | img = self.img_trans(img) 94 | 95 | return img, filename 96 | -------------------------------------------------------------------------------- 
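A minimal usage sketch for the ImageFolder datareader above (an editor's illustration, not a file from the repository). It assumes the default folder layout used elsewhere in the repo ('data/detection/images' is the default image folder in rock/detect.py and rock/utils/tensorboard_model.py); the .pkl source folder below is a placeholder.

# Illustrative only: extract images from pre-processed .pkl samples, then iterate
# over the resulting image folder the same way rock/detect.py does.
import torch.utils.data

import rock.datasets.image_folder

# Convert pre-processed .pkl samples to .png images (helper shown above)
rock.datasets.image_folder.extract_image_and_save_to_folder(
    data_folder_path='data/train_test/nyuv2_test',  # placeholder: folder of .pkl samples
    save_folder_path='data/detection/images')

dataset = rock.datasets.image_folder.ImageFolder('data/detection/images')
loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=2)

imgs, filenames = next(iter(loader))
print(imgs.shape)  # torch.Size([8, 3, 480, 640]): centre-cropped, resized and ImageNet-normalized

Because __getitem__ returns the filename alongside the image tensor, the filenames travel with each batch and can be reused to name the corresponding detection outputs, which is what rock/detect.py does.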
/rock/model/network.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | import rock.model.auxiliary 7 | import rock.model.backbone 8 | import rock.model.detection 9 | 10 | 11 | class Network(nn.Module): 12 | """Network class which combines the backbone, the auxiliary tasks (optional), and the detection block 13 | 14 | Can be used to implement the ROCK network architecture or a baseline Single Shot Detector 15 | """ 16 | 17 | def __init__(self, 18 | backbone: torch.nn.Module, 19 | detection: torch.nn.Module, 20 | auxiliary: Optional[torch.nn.Module] = None) -> None: 21 | """ 22 | Args: 23 | backbone: backbone used to obtain the base feature map 24 | detection: detection layer of Network 25 | auxiliary: auxiliary block used for MTL 26 | """ 27 | super().__init__() 28 | 29 | self.feature_extractor = backbone 30 | self.detection = detection 31 | self.auxiliary = auxiliary 32 | 33 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, ...]: 34 | """ Forward implementation of model 35 | 36 | Gives a tuple containing the location and confidence tensors, as well as the scene, depth and normals tensors 37 | if auxiliary tasks are added 38 | 39 | Shape: 40 | - X: :math:`(N, 3, H_in, W_in)` where :math:`(H_in, W_in)` are the image height and width 41 | - Locs: :math:`(N, 4, num_priors)` where :math:`num_priors` is the number of priors 42 | - Confs: :math:`(N, num_labels, num_priors)` where :math:`num_labels` is the number of object labels 43 | - Scene pred: :math:`(N, num_scenes)` 44 | - Depth pred: :math:`(N, 1, H_featuremap, W_featuremap)` 45 | - Normals pred: :math:`(N, 3, H_featuremap, W_featuremap)` 46 | | 47 | For ROCK implementation on the NYUv2 dataset: 48 | - :math:`(H_in, W_in) = (480, 640)` 49 | - :math:`num_priors = 7228` 50 | - :math:`num_labels = 20` 51 | - :math:`num_scenes = 27` 52 | - :math:`(H_featuremap, W_featuremap) = (30, 40)` 53 | """ 54 | x = self.feature_extractor(x) 55 | 56 | aux_out = [] 57 | if self.auxiliary: 58 | # For ROCK without fusion, replace x by _ 59 | x, scene, depth, normals = self.auxiliary(x) 60 | aux_out.extend([scene, depth, normals]) 61 | 62 | locs, confs = self.detection(x) 63 | 64 | return (locs, confs, *aux_out) 65 | 66 | 67 | def rock_network(aux_tasks: Tuple[str] = ('scene', 'depth', 'normals')) -> torch.nn.Module: 68 | """ 69 | Creates a model similar to the one described in the paper 70 | "Revisiting Multi-Task Learning with ROCK: a Deep Residual Auxiliary Block for Visual Detection" 71 | """ 72 | backbone = rock.model.backbone.ResNet50() 73 | detection = rock.model.detection.Detection() 74 | auxiliary = rock.model.auxiliary.ROCK(aux_tasks) 75 | model = Network(backbone=backbone, detection=detection, auxiliary=auxiliary) 76 | return model 77 | 78 | 79 | def baseline_ssd() -> torch.nn.Module: 80 | """ 81 | Creates a network similar to the one described in the paper 82 | "Revisiting Multi-Task Learning with ROCK: a DeepResidual Auxiliary Block for Visual Detection", 83 | but with no auxiliary block (ROCK). 
This network is based on the SSD (Single Shot Detector) architecture 84 | """ 85 | backbone = rock.model.backbone.ResNet50() 86 | detection = rock.model.detection.Detection() 87 | model = Network(backbone=backbone, detection=detection, auxiliary=None) 88 | return model 89 | -------------------------------------------------------------------------------- /rock/utils/show.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from typing import Dict, List 4 | 5 | import torch 6 | import torchvision 7 | from PIL import Image 8 | from torch.utils.data import DataLoader 9 | 10 | import rock.ssd.encoder 11 | from rock.utils.draw import draw_transforms, inv_norm, draw_predictions 12 | 13 | 14 | def predict_grid(model: torch.nn.Module, 15 | dataset: torch.utils.data.Dataset, 16 | encoder: rock.ssd.encoder.Encoder, 17 | label_map: Dict[int, str], 18 | device: torch.device = torch.device("cuda"), 19 | conf_threshold: float = 0.0) -> torch.Tensor: 20 | """ Obtains grid of batch images with ground truth boxes and the model's detections 21 | 22 | Args: 23 | model: network 24 | dataset: dataset on which to output grid 25 | encoder: encoder use to encode / decode the network's output 26 | label_map: dictionary mapping class id -> class name 27 | device: device to use to obtain grids (default: cuda) 28 | conf_threshold: the confidence threshold to show detections (default: 0.0) 29 | 30 | Returns: 31 | grid of images batch 32 | 33 | """ 34 | batch_size = 8 35 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2, drop_last=True) 36 | dataiter = iter(dataloader) 37 | 38 | sample = next(dataiter) 39 | 40 | return _batch_image(sample, model, encoder, label_map, device, conf_threshold) 41 | 42 | 43 | def all_predict_grids(model: torch.nn.Module, 44 | dataset: torch.utils.data.Dataset, 45 | encoder: rock.ssd.encoder.Encoder, 46 | label_map: Dict[int, str], 47 | device: torch.device = torch.device("cuda"), 48 | conf_threshold: float = 0.0) -> List[torch.Tensor]: 49 | """ Obtains all grids of batch images with ground truth boxes and the model's detections 50 | 51 | Args: 52 | model: network 53 | dataset: dataset on which to output grid 54 | encoder: encoder use to encode / decode the network's output 55 | label_map: dictionary mapping class id -> class name 56 | device: device to use to obtain grids (default: cuda) 57 | conf_threshold: the confidence threshold to show detections (default: 0.0) 58 | 59 | Returns: 60 | all grids of images batch 61 | """ 62 | batch_size = 8 63 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2, drop_last=True) 64 | 65 | grids = [] 66 | for i, sample in enumerate(dataloader): 67 | grids.append(_batch_image(sample, model, encoder, label_map, device, conf_threshold)) 68 | 69 | return grids 70 | 71 | 72 | def _batch_image(sample: Dict[str, torch.Tensor], 73 | model: torch.nn.Module, 74 | encoder: rock.ssd.encoder.Encoder, 75 | label_map: Dict[int, str], 76 | device: torch.device = torch.device("cuda"), 77 | conf_threshold: float = 0.0) -> torch.Tensor: 78 | 79 | # Put model in eval mode 80 | batch_size = 8 81 | model.eval() 82 | model.to(device) 83 | 84 | imgs, bboxes, labels = sample['img'], sample['bboxes'], sample['labels'] 85 | 86 | i = 0 87 | save_folder = 'data/tensorboard_images' 88 | Path(save_folder).mkdir(parents=True, exist_ok=True) 89 | 90 | save_paths = [] 91 | for img, bbox, label in zip(imgs, bboxes, labels): 92 | save_path = 
os.path.join(save_folder, 'gt_{}.png'.format(i)) 93 | save_paths.append(save_path) 94 | draw_transforms(inv_norm(img), bbox, label, label_map=label_map, save_path=save_path, show=False) 95 | i += 1 96 | 97 | with torch.no_grad(): 98 | inp = imgs.clone().to(device) 99 | ploc, plabel, *aux_out = model(inp) 100 | 101 | for i in range(batch_size): 102 | save_path = os.path.join(save_folder, 'pt_{}.png'.format(i)) 103 | save_paths.append(save_path) 104 | draw_predictions(inv_norm(imgs[i]), encoder, ploc, plabel, i, label_map=label_map, save_path=save_path, show=False, conf_threshold=conf_threshold) 105 | 106 | # Put model back in training mode 107 | model.train() 108 | 109 | images = [] 110 | 111 | for path in save_paths: 112 | img = Image.open(path) 113 | img = img.resize((320, 240)) 114 | 115 | t = torchvision.transforms.ToTensor()(img) 116 | images.append(t) 117 | 118 | img_grid = torchvision.utils.make_grid(images, padding=5) 119 | 120 | return img_grid 121 | -------------------------------------------------------------------------------- /rock/ssd/prior_boxes.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from math import sqrt 3 | from typing import Tuple, List 4 | 5 | import numpy as np 6 | import torch 7 | 8 | 9 | class PriorBoxes(object): 10 | """ Prior boxes of the network 11 | 12 | Modified from https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Detection/SSD for 13 | more modularity and support for rectangular images 14 | """ 15 | 16 | def __init__(self, fig_size: Tuple[int, int], 17 | feat_size: List[Tuple[int, int]], 18 | steps: Tuple[List[int], List[int]], 19 | sk: List[float], 20 | aspect_ratios: List[List[int]], 21 | variance_xy: float = 0.1, 22 | variance_wh: float = 0.2) -> None: 23 | """ 24 | 25 | Args: 26 | fig_size: size of input image 27 | feat_size: list of sizes of the feature maps 28 | steps: scale change for each feature map 29 | sk: scale of the prior boxes for each feature map 30 | aspect_ratios: aspect ratio of the prior boxes for each feature map 31 | variance_xy: default value 0.1 32 | variance_wh: default value 0.2 33 | """ 34 | 35 | self.feat_size = feat_size 36 | self.fig_height, self.fig_width = fig_size 37 | 38 | # More info: https://leimao.github.io/blog/Bounding-Box-Encoding-Decoding/ 39 | self.variance_xy_ = variance_xy 40 | self.variance_wh_ = variance_wh 41 | 42 | self.steps_height, self.steps_width = steps 43 | self.sk = sk 44 | 45 | fk_height = self.fig_height / np.array(self.steps_height) 46 | fk_width = self.fig_width / np.array(self.steps_width) 47 | self.aspect_ratios = aspect_ratios 48 | 49 | self.prior_boxes = [] 50 | 51 | # Incorporate different features sizes for width and height 52 | for idx, (sfeat_height, sfeat_width) in enumerate(self.feat_size): 53 | 54 | sk1 = self.sk[idx] 55 | sk2 = self.sk[idx + 1] 56 | sk3 = sqrt(sk1 * sk2) 57 | all_sizes = [(sk1, sk1), (sk3, sk3)] 58 | 59 | for alpha in aspect_ratios[idx]: 60 | w, h = sk1 * sqrt(alpha), sk1 / sqrt(alpha) 61 | all_sizes.append((w, h)) 62 | all_sizes.append((h, w)) 63 | for w, h in all_sizes: 64 | 65 | # Iterate over product of width and height 66 | for i, j in itertools.product(range(sfeat_height), range(sfeat_width)): 67 | cx, cy = (j + 0.5) / fk_width[idx], (i + 0.5) / fk_height[idx] 68 | self.prior_boxes.append((cx, cy, w, h)) 69 | 70 | self.pboxes = torch.tensor(self.prior_boxes, dtype=torch.float) 71 | self.pboxes.clamp_(min=0, max=1) 72 | 73 | # For IoU calculation 74 | self.pboxes_ltrb = 
self.pboxes.clone() 75 | self.pboxes_ltrb[:, 0] = self.pboxes[:, 0] - 0.5 * self.pboxes[:, 2] 76 | self.pboxes_ltrb[:, 1] = self.pboxes[:, 1] - 0.5 * self.pboxes[:, 3] 77 | self.pboxes_ltrb[:, 2] = self.pboxes[:, 0] + 0.5 * self.pboxes[:, 2] 78 | self.pboxes_ltrb[:, 3] = self.pboxes[:, 1] + 0.5 * self.pboxes[:, 3] 79 | 80 | @property 81 | def variance_xy(self) -> float: 82 | """ More info: https://leimao.github.io/blog/Bounding-Box-Encoding-Decoding/ 83 | """ 84 | return self.variance_xy_ 85 | 86 | @property 87 | def variance_wh(self) -> float: 88 | """ More info: https://leimao.github.io/blog/Bounding-Box-Encoding-Decoding/ 89 | """ 90 | return self.variance_wh_ 91 | 92 | def scale_change(self) -> float: 93 | """ Scale between input image and feature map 94 | """ 95 | return sqrt(self.steps_height[0] * self.steps_width[0]) 96 | 97 | def __call__(self, order: str = "ltrb") -> torch.Tensor: 98 | if order == "ltrb": 99 | return self.pboxes_ltrb 100 | if order == "xywh": 101 | return self.pboxes 102 | 103 | 104 | def pboxes_rock() -> PriorBoxes: 105 | """Prior boxes for the NYUv2 dataset 106 | 107 | Returns: 108 | prior boxes for the given specifications 109 | """ 110 | figsize = (480, 640) 111 | feat_size = [(30, 40), (15, 20), (8, 10), (4, 5), (2, 3), (1, 1)] 112 | 113 | # steps is [figsize_h/(feat_size[i][h]), figsize_w/(feat_size[i][w]))] 114 | steps_h = [16, 32, 60, 120, 240, 480] 115 | steps_w = [16, 32, 64, 128, 213, 640] 116 | steps = (steps_h, steps_w) 117 | 118 | # 6 layers: conv4_3 = 0.07, smin=0.15, smax=1.05 119 | sk = [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05] 120 | 121 | aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] 122 | pboxes = PriorBoxes(figsize, feat_size, steps, sk, aspect_ratios) 123 | return pboxes 124 | -------------------------------------------------------------------------------- /rock/training.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Dict, Any, Tuple 2 | 3 | import torch 4 | import torch.optim 5 | import torch.utils.data.dataloader 6 | import torch.utils.tensorboard.writer 7 | 8 | 9 | def train_loop(model: torch.nn.Module, 10 | loss_func: torch.nn.Module, 11 | dataloader: torch.utils.data.dataloader.DataLoader, 12 | epoch: int, 13 | iteration: int, 14 | optimizer: Any, # from torch.optim (i.e. Adam, SGD, ...) 15 | scheduler: Any, # from torch.optim.lr_scheduler (i.e. 
MultiStepLR) 16 | writer: Optional[torch.utils.tensorboard.writer.SummaryWriter] = None, 17 | device: torch.device = torch.device("cuda")) -> Tuple[float, int]: 18 | """ Training loop 19 | Implements the forward + backward pass for each sample and obtains the loss on the training data 20 | """ 21 | model.train() 22 | 23 | train_running_loss = 0.0 24 | train_loss_dict = None if writer is None else loss_dict() 25 | 26 | for sample in dataloader: 27 | optimizer.zero_grad() 28 | 29 | loss_sample = forward_pass(model, sample, device) 30 | 31 | loss = loss_func(loss_sample, train_loss_dict) 32 | train_running_loss += loss.item() 33 | loss.backward() 34 | 35 | # If we want gradient clipping: 36 | # clipping_value = 1 # arbitrary value of your choosing 37 | # torch.nn.utils.clip_grad_value_(model.parameters(), clipping_value) 38 | optimizer.step() 39 | 40 | scheduler.step() 41 | iteration += 1 42 | 43 | train_loss = train_running_loss / len(dataloader) 44 | 45 | if writer is not None: 46 | writer.add_scalar('train loss', train_loss, epoch) 47 | 48 | for key, value in train_loss_dict.items(): 49 | writer.add_scalar(key + '_train', value / len(dataloader), epoch) 50 | 51 | return train_loss, iteration 52 | 53 | 54 | def val_loop(model: torch.nn.Module, 55 | loss_func: torch.nn.Module, 56 | dataloader: torch.utils.data.dataloader.DataLoader, 57 | epoch: int, 58 | writer: Optional[torch.utils.tensorboard.writer.SummaryWriter] = None, 59 | device: torch.device = torch.device("cuda"), 60 | dataset_type: str = 'val') -> float: 61 | """ Validation loop (no backprop) 62 | Obtains the loss on the validation (or test) data 63 | """ 64 | model.eval() 65 | 66 | val_running_loss = 0.0 67 | val_loss_dict = loss_dict() 68 | 69 | # Get val data loss 70 | for sample in dataloader: 71 | with torch.no_grad(): 72 | loss_sample = forward_pass(model, sample, device) 73 | 74 | loss = loss_func(loss_sample, val_loss_dict) 75 | 76 | val_running_loss += loss.item() 77 | 78 | val_loss = val_running_loss / len(dataloader) 79 | 80 | if writer is not None: 81 | writer.add_scalar('val loss'.format(dataset_type), val_loss, epoch) 82 | 83 | for key, value in val_loss_dict.items(): 84 | writer.add_scalar('{}_{}'.format(key, dataset_type), value / len(dataloader), epoch) 85 | 86 | return val_loss 87 | 88 | 89 | def forward_pass(model: torch.nn.Module, 90 | sample: Dict[str, torch.Tensor], 91 | device: torch.device) -> Dict[str, torch.Tensor]: 92 | """ Forward pass of the network for a given sample 93 | """ 94 | img = sample['img'].to(device) 95 | bboxes = sample['bboxes'].to(device) 96 | labels = sample['labels'].to(device) 97 | bboxes_mask = sample['bboxes_mask'].to(device) 98 | 99 | # Forward pass 100 | ploc, plabel, *aux_out = model(img) 101 | 102 | # Create the loss sample 103 | loss_sample = {} 104 | loss_sample['ploc'] = ploc 105 | loss_sample['plabel'] = plabel 106 | loss_sample['bboxes'] = bboxes 107 | loss_sample['labels'] = labels 108 | loss_sample['bboxes_mask'] = bboxes_mask 109 | 110 | # Check if we should include auxiliary 111 | if sample['auxiliary'].all() and aux_out is not None: 112 | loss_sample['scene_gt'] = sample['scene_id'].to(device) 113 | loss_sample['scene_pred'] = aux_out[0] 114 | 115 | loss_sample['depth_gt'] = sample['depth'].to(device) 116 | loss_sample['depth_pred'] = aux_out[1] 117 | 118 | loss_sample['normals_gt'] = sample['normals'].to(device) 119 | loss_sample['normals_mask'] = sample['normals_mask'].to(device) 120 | loss_sample['normals_pred'] = aux_out[2] 121 | 122 | return loss_sample 123 | 124 | 
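# Editor's note -- illustrative sketch, not part of the original module: how the two
# loops above are typically wired together for one epoch. The multi-task loss is
# assumed to come from rock.model.losses (its constructor is not shown in this file),
# and train_loader / val_loader / checkpoint_path are placeholder names.
#
#     for epoch in range(start_epoch, num_epochs):
#         train_loss, iteration = train_loop(model, loss_func, train_loader, epoch,
#                                            iteration, optimizer, scheduler, writer=writer)
#         val_loss = val_loop(model, loss_func, val_loader, epoch, writer=writer)
#         torch.save({'model': model.state_dict(), 'optimizer': optimizer.state_dict(),
#                     'scheduler': scheduler.state_dict(), 'epoch': epoch,
#                     'iteration': iteration}, checkpoint_path)
#
# The saved keys mirror the ones read back by rock.utils.load.load_from_checkpoint.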
125 | def loss_dict() -> Dict[str, float]: 126 | """ Initialize loss dictionary for network sub-tasks 127 | """ 128 | 129 | # Add z in front of each key name so they appear on the bottom (and together) in tensorboard 130 | d = {'z_normals_loss': 0.0, 'z_scene_loss': 0.0, 'z_depth_loss': 0.0, 'z_conf_loss': 0.0, 'z_loc_loss': 0.0} 131 | return d 132 | -------------------------------------------------------------------------------- /rock/datasets/nyu_depth_v2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from typing import List, Dict, Any, Tuple 4 | 5 | import numpy as np 6 | import torch 7 | from PIL import Image 8 | from torch.utils import data as data 9 | 10 | import rock.datasets.transforms 11 | 12 | 13 | class NYUv2Detection(data.Dataset): 14 | """Datareader for the NYUv2 Dataset 15 | """ 16 | 17 | def __init__(self, 18 | path: str, 19 | transform: rock.datasets.transforms.Transformer, 20 | auxiliary: bool = True) -> None: 21 | """ 22 | Args: 23 | path: path to the folder containing the pre-processed dataset 24 | transform: input transformer 25 | auxiliary: boolean that indicates whether to include auxiliary tasks or not 26 | """ 27 | self.files = sorted([os.path.join(path, file) for file in os.listdir(path) if file.endswith(".pkl")]) 28 | 29 | datum = self.pickle_load(self.files[0]) 30 | 31 | self.img_size = (datum['img'].shape[0], datum['img'].shape[1]) 32 | 33 | self.classes = datum['rock_classes'] 34 | self.label_info = {} 35 | for i, name in enumerate(self.classes): 36 | self.label_info[i] = name 37 | 38 | self.transform = transform 39 | self.auxiliary = auxiliary 40 | 41 | @property 42 | def num_labels(self) -> int: 43 | """ Number of labels in the dataset 44 | """ 45 | return len(self.label_info) 46 | 47 | @property 48 | def categories(self) -> List[str]: 49 | """ Name of all labels (object categories) 50 | """ 51 | return self.classes 52 | 53 | @property 54 | def label_map(self) -> Dict[int, str]: 55 | """ Map from label num to name of label 56 | """ 57 | return self.label_info 58 | 59 | def pickle_load(self, filepath: str) -> Any: 60 | """ Loads pickled data from a given path 61 | """ 62 | with open(filepath, 'rb') as handle: 63 | loaded_data = pickle.load(handle) 64 | return loaded_data 65 | 66 | def pickle_save(self, filepath: str): 67 | """ Saves data to a given path as a .pkl file 68 | """ 69 | with open(filepath, 'wb') as handle: 70 | pickle.dump(self, handle) 71 | 72 | def get_eval(self, idx: int) -> Tuple[List[Any], List[int]]: 73 | """ Gets the bounding box info of a sample for evaluation 74 | 75 | Args: 76 | idx: image index 77 | 78 | Returns: 79 | tuple containing: 80 | list of bounding box location and list of bounding box labels 81 | 82 | """ 83 | d = self.pickle_load(self.files[idx]) 84 | 85 | bbox_sizes = [] 86 | bbox_labels = [] 87 | 88 | for elem in (d['bboxes']): 89 | l, t, r, b = elem['box'] 90 | w, h = r - l, b - t 91 | 92 | bbox_size = (l, t, w, h) 93 | bbox_label = elem['labels'] 94 | bbox_sizes.append(bbox_size) 95 | bbox_labels.append(bbox_label) 96 | 97 | return bbox_sizes, bbox_labels 98 | 99 | def __len__(self) -> int: 100 | return len(self.files) 101 | 102 | def __getitem__(self, idx: int) -> Dict[str, Any]: 103 | """ Obtains a sample from a given index of the dataset 104 | 105 | Output types of dict items are: 106 | - `img` (torch.Tensor) Shape: :math:`(3, H_img, W_img)` 107 | - `img_id` (int) 108 | - `size` (Tuple[int, int]) 109 | - `bboxes` (torch.Tensor) Shape: :math:`(4, 
num_priors)` 110 | - `labels` (torch.Tensor) Shape: :math:`(num_priors)` 111 | - `bboxes_mask` (torch.BoolTensor) Shape: :math:`(num_priors)` 112 | - `scene_id` (int) 113 | - `depth` (torch.Tensor) Shape: :math:`(1, H_featuremap, W_featuremap)` 114 | - `normals` (torch.Tensor) Shape: :math:`(3, H_featuremap, W_featuremap)` 115 | - `normals_mask` (torch.BoolTensor) Shape: :math:`(1, H_featuremap, W_featuremap)` 116 | - `auxiliary` (bool) 117 | """ 118 | 119 | # Load sample files 120 | d = self.pickle_load(self.files[idx]) 121 | 122 | img = Image.fromarray(d['img']) 123 | img_id = idx 124 | 125 | htot, wtot = self.img_size 126 | 127 | bboxes, labels = [], [] 128 | for elem in (d['bboxes']): 129 | l, t, r, b = elem['box'] 130 | bbox_label = elem['labels'] 131 | bbox_size = (l / wtot, t / htot, r / wtot, b / htot) 132 | bboxes.append(bbox_size) 133 | labels.append(bbox_label) 134 | 135 | bboxes = torch.tensor(bboxes, dtype=torch.float) 136 | labels = torch.tensor(labels) 137 | mask = None 138 | 139 | # Auxiliary tasks part 140 | if self.auxiliary: 141 | scene_id = d['scene_id'] 142 | depth = d['depth'] 143 | max_depth = depth.max() 144 | 145 | depth = np.uint8(255 * depth / max_depth) 146 | depth = Image.fromarray(depth, 'L') 147 | 148 | normals = Image.fromarray(d['normals']) 149 | normals_mask = Image.fromarray(d['mask']) 150 | else: 151 | scene_id, depth, max_depth, normals, normals_mask = 0, 0, 0, 0, 0 152 | 153 | sample = {'img': img, 'img_id': img_id, 'size': (htot, wtot), 154 | 'bboxes': bboxes, 'labels': labels, 'bboxes_mask': mask, 155 | 'scene_id': scene_id, 'depth': depth, 'normals': normals, 'normals_mask': normals_mask, 156 | 'auxiliary': self.auxiliary} 157 | 158 | sample = self.transform(sample) 159 | 160 | if self.auxiliary: 161 | sample['depth'] = sample['depth'] * max_depth 162 | 163 | return sample 164 | -------------------------------------------------------------------------------- /rock/detect.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from pathlib import Path 4 | from typing import Tuple, Optional 5 | 6 | import torch 7 | import torch.utils.data 8 | 9 | import rock.ssd.prior_boxes 10 | import rock.ssd.encoder 11 | import rock.datasets.transforms 12 | import rock.datasets.image_folder 13 | import rock.model.network 14 | import rock.utils.load 15 | import rock.utils.draw 16 | 17 | 18 | def object_detection(model_path: str, 19 | image_folder_path: str = 'data/detection/images', 20 | detection_output_path: str = 'data/detection/output', 21 | scene_output_path: Optional[str] = None, 22 | depth_output_path: Optional[str] = None, 23 | normals_output_path: Optional[str] = None, 24 | device: torch.device = torch.device("cuda"), 25 | aux: bool = True, 26 | aux_tasks: Tuple[str, ...] 
= ('scene', 'depth', 'normals'), 27 | conf_threshold: float = 0.4, 28 | throughput: bool = False, 29 | verbose: bool = True) -> None: 30 | """ Loads a model and detects images at a given path 31 | """ 32 | if detection_output_path: 33 | Path(detection_output_path).mkdir(parents=True, exist_ok=True) 34 | if scene_output_path: 35 | Path(scene_output_path).mkdir(parents=True, exist_ok=True) 36 | if depth_output_path: 37 | Path(depth_output_path).mkdir(parents=True, exist_ok=True) 38 | if normals_output_path: 39 | Path(normals_output_path).mkdir(parents=True, exist_ok=True) 40 | 41 | if verbose and not throughput: 42 | print("Running object detection with model: {}".format(model_path)) 43 | 44 | if throughput: 45 | print("Calculating throughput disables saving detection output to folder") 46 | pboxes = rock.ssd.prior_boxes.pboxes_rock() 47 | encoder = rock.ssd.encoder.Encoder(pboxes) 48 | image_data = rock.datasets.image_folder.ImageFolder(image_folder_path) 49 | 50 | model = rock.model.network.rock_network(aux_tasks) if aux else rock.model.network.baseline_ssd() 51 | model = model.to(device) 52 | rock.utils.load.load_from_checkpoint(model_path, model, verbose=verbose) 53 | 54 | predict(model=model, dataset=image_data, encoder=encoder, device=device, 55 | conf_threshold=conf_threshold, detection_output_path=detection_output_path, 56 | scene_output_path=scene_output_path, depth_output_path=depth_output_path, 57 | normals_output_path=normals_output_path, aux=aux, aux_tasks=aux_tasks, throughput=throughput, 58 | verbose=verbose) 59 | 60 | if verbose and not throughput: 61 | print("Detections saved to: {}".format(detection_output_path)) 62 | print("Done!") 63 | 64 | 65 | def predict(model: torch.nn.Module, 66 | dataset: torch.utils.data.Dataset, 67 | encoder: rock.ssd.encoder.Encoder, 68 | detection_output_path: str, 69 | scene_output_path: str, 70 | depth_output_path: str, 71 | normals_output_path: str, 72 | device: torch.device, 73 | aux: bool, 74 | aux_tasks: Tuple[str, ...], 75 | conf_threshold: float, 76 | throughput: bool, 77 | verbose: bool) -> float: 78 | """ Performs object detection for a given model 79 | 80 | Returns the number of images evaluated per sec (forward pass) if show_images_per_sec is False, otherwise, 81 | prints the number of images evaluated per sec 82 | """ 83 | model.eval() 84 | model.to(device) 85 | 86 | batch_size = 1 if throughput else 8 87 | loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2, drop_last=False) 88 | 89 | total_images = len(dataset) 90 | total_time = 0 91 | 92 | for i, (imgs, filenames) in enumerate(loader): 93 | tic = time.time() 94 | with torch.no_grad(): 95 | imgs = imgs.to(device) 96 | ploc, plabel, *aux_out = model(imgs) 97 | 98 | toc = time.time() 99 | total_time += (toc - tic) 100 | 101 | # Save images only if we are not checking the throughput 102 | if not throughput: 103 | for j in range(imgs.shape[0]): 104 | save_path = os.path.join(detection_output_path, filenames[j]) 105 | rock.utils.draw.draw_predictions(img=rock.utils.draw.inv_norm(imgs[j]), 106 | encoder=encoder, ploc=ploc, plabel=plabel, idx=j, 107 | label_map=rock.utils.draw.rock_label_map(), show=False, 108 | save_path=save_path, conf_threshold=conf_threshold) 109 | 110 | if aux: 111 | if 'scene' in aux_tasks and scene_output_path: 112 | scene = aux_out[0] 113 | scene_save_path = os.path.join(scene_output_path, filenames[j]) 114 | scene_save_path = os.path.splitext(scene_save_path)[0] + '.txt' 115 | rock.utils.draw.write_scenes(scene[j], 
scene_save_path, log=True) 116 | 117 | if 'depth' in aux_tasks and depth_output_path: 118 | depth = aux_out[1] 119 | depth_save_path = os.path.join(depth_output_path, filenames[j]) 120 | rock.utils.draw.draw_depth(depth[j], depth_save_path, log=True) 121 | 122 | if 'normals' in aux_tasks and normals_output_path: 123 | normals = aux_out[2] 124 | normals_save_path = os.path.join(normals_output_path, filenames[j]) 125 | rock.utils.draw.draw_normals(normals[j], normals_save_path) 126 | 127 | if verbose or throughput: 128 | print("{}/{} images detected".format((i+1) * batch_size, total_images), end='\r') 129 | 130 | model.train() 131 | 132 | images_per_sec = total_images / total_time 133 | 134 | if throughput: 135 | print() 136 | print("Throughput: {:.2f} images/sec".format(images_per_sec)) 137 | elif verbose: 138 | print("{}/{} images detected".format(total_images, total_images)) 139 | 140 | return images_per_sec 141 | -------------------------------------------------------------------------------- /rock/model/detection.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Optional 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class Detection(nn.Module): 8 | """SSD detection layers 9 | 10 | Consists of 6 detection layers (by default), used for object detection and classification. 11 | """ 12 | 13 | def __init__(self, 14 | additional_layers: Optional[torch.nn.ModuleList] = None) -> None: 15 | """ Initializes the detection layers, uses the default additional layers if ``additional_layers`` is None 16 | """ 17 | super().__init__() 18 | 19 | self.out_channels = [2048, 1024, 512, 256, 256, 256] 20 | self.num_priors = [4, 6, 6, 6, 4, 4] 21 | 22 | self.num_labels = 20 # number of ROCK classes 23 | 24 | if additional_layers: 25 | self.additional_layers = additional_layers 26 | else: 27 | self.additional_layers = self._build_default_additional_layers() 28 | 29 | self.loc, self.conf = self._build_loc_and_conf() 30 | 31 | self._init_weights() 32 | self._init_bias() 33 | 34 | def _build_default_additional_layers(self) -> nn.ModuleList: 35 | """ Constructs the default additional layers for the ROCK implementation on the NYUv2 dataset 36 | """ 37 | additional_layers = [] 38 | mid_layer_channels = [1024, 256, 128, 128, 128] 39 | 40 | for i, (input_size, output_size, channels) in enumerate( 41 | zip(self.out_channels[:-1], self.out_channels[1:], mid_layer_channels)): 42 | if i <= 2: 43 | layer = nn.Sequential( 44 | nn.Conv2d(input_size, channels, kernel_size=1), 45 | nn.BatchNorm2d(channels, momentum=0.01, track_running_stats=True), 46 | nn.ReLU(inplace=True), 47 | nn.Conv2d(channels, output_size, kernel_size=3, padding=1, stride=2), 48 | nn.BatchNorm2d(output_size, momentum=0.01, track_running_stats=True), 49 | nn.ReLU(inplace=True), 50 | ) 51 | elif i == 3: 52 | layer = nn.Sequential( 53 | nn.Conv2d(input_size, channels, kernel_size=1), 54 | nn.BatchNorm2d(channels, momentum=0.01, track_running_stats=True), 55 | nn.ReLU(inplace=True), 56 | nn.Conv2d(channels, output_size, kernel_size=3), 57 | nn.BatchNorm2d(output_size, momentum=0.01, track_running_stats=True), 58 | nn.ReLU(inplace=True), 59 | ) 60 | else: 61 | # This layer goes from (N, C_in, 2,3) to (N, C_out, 1,1). 
62 | # If input is of another shape (such as square), change the 63 | # kernel size of this layer in order to obtain a (1, 1) output 64 | layer = nn.Sequential( 65 | nn.Conv2d(input_size, channels, kernel_size=1), 66 | nn.BatchNorm2d(channels, momentum=0.01, track_running_stats=True), 67 | nn.ReLU(inplace=True), 68 | nn.Conv2d(channels, output_size, kernel_size=(2, 3)), 69 | nn.BatchNorm2d(output_size, momentum=0.01, track_running_stats=True), 70 | nn.ReLU(inplace=True), 71 | ) 72 | 73 | additional_layers.append(layer) 74 | 75 | return nn.ModuleList(additional_layers) 76 | 77 | def _build_loc_and_conf(self): 78 | loc = [] 79 | conf = [] 80 | for num_prior, out_channel in zip(self.num_priors, self.out_channels): 81 | loc.append(nn.Conv2d(out_channel, num_prior * 4, kernel_size=3, padding=1)) 82 | conf.append(nn.Conv2d(out_channel, num_prior * self.num_labels, kernel_size=3, padding=1)) 83 | 84 | loc = nn.ModuleList(loc) 85 | conf = nn.ModuleList(conf) 86 | 87 | return loc, conf 88 | 89 | def _init_weights(self) -> None: 90 | """ Weight initialization for additional layers 91 | """ 92 | layers = [*self.additional_layers, *self.loc, *self.conf] 93 | for layer in layers: 94 | for param in layer.parameters(): 95 | # Switch from xavier_uniform_ to kaiming_normal_ 96 | if param.dim() > 1: 97 | nn.init.kaiming_normal_(param, nonlinearity="relu") 98 | 99 | def _init_bias(self) -> None: 100 | """ Bias initialization for confidence layers 101 | 102 | As indicated in the paper "Focal Loss for Dense Object Detection" https://arxiv.org/abs/1708.02002 103 | """ 104 | 105 | final_conf_layers = self._get_final_conf_layers() 106 | for layer in final_conf_layers: 107 | pi = 0.01 108 | fg_value = -torch.log(torch.tensor((1 - pi) / pi)).item() 109 | bias = layer.bias.data 110 | bias = torch.zeros_like(bias) 111 | bias = bias.reshape((self.num_labels, -1)) 112 | bias[1:, :] = fg_value 113 | layer.bias.data = bias.reshape(-1) 114 | 115 | def _get_final_conf_layers(self) -> List[nn.Module]: 116 | """ Gets the final conf layer (the one that outputs the result) for each detection layer 117 | """ 118 | conf_layers = [*self.conf] 119 | final_conf_layers = [] 120 | 121 | for layer in conf_layers: 122 | # Conf layer can either be one convolutional layer or a sequence of layers 123 | if isinstance(layer, torch.nn.Sequential): 124 | final_conf_layers.append(layer[-1]) 125 | else: 126 | final_conf_layers.append(layer) 127 | 128 | return final_conf_layers 129 | 130 | def _reshape_bbox(self, 131 | detection_feed: List[torch.Tensor], 132 | loc: torch.nn.modules.container.ModuleList, 133 | conf: torch.nn.modules.container.ModuleList) -> Tuple[torch.Tensor, torch.Tensor]: 134 | """Reshapes the input to correspond to the prior boxes 135 | """ 136 | locs = [] 137 | confs = [] 138 | for x, l, c in zip(detection_feed, loc, conf): 139 | loc_out = l(x).reshape((x.shape[0], 4, -1)) 140 | locs.append(loc_out) 141 | 142 | conf_out = c(x).reshape((x.shape[0], self.num_labels, -1)) 143 | confs.append(conf_out) 144 | 145 | locs = torch.cat(locs, dim=-1).contiguous() 146 | confs = torch.cat(confs, dim=-1).contiguous() 147 | return locs, confs 148 | 149 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 150 | """ Forward implementation of the SSD detection layers 151 | 152 | Shape: 153 | - X: :math:`(N, C_in, H_in, W_in)` where N is the batch size 154 | - Locs: :math:`(N, 4, num_priors)` where :math:`num_priors` is the number of priors 155 | - Confs: :math:`(N, num_labels, num_priors)` where :math:`num_labels` is the 
number of object labels 156 | | 157 | For ROCK implementation on the NYUv2 dataset: 158 | - :math:`C_in = 2048` 159 | - :math:`(H_in, W_in) = (30, 40)` 160 | - :math:`num_priors = 7228` 161 | - :math:`num_labels = 20` 162 | """ 163 | detection_feed = [x] 164 | for layer in self.additional_layers: 165 | x = layer(x) 166 | detection_feed.append(x) 167 | 168 | locs, confs = self._reshape_bbox(detection_feed, self.loc, self.conf) 169 | 170 | return locs, confs 171 | 172 | -------------------------------------------------------------------------------- /rock/utils/draw.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | import numpy as np 4 | import torch 5 | import torchvision 6 | from matplotlib import pyplot as plt, patches as patches 7 | from torchvision import transforms as transforms 8 | 9 | import rock.ssd.encoder 10 | from rock.model import losses 11 | 12 | 13 | def write_scenes(t: torch.Tensor, save_path: str, log: bool = True) -> None: 14 | """ Writes scene predictions in sorted order to a given file 15 | """ 16 | scene_types = ['basement', 'bathroom', 'bedroom', 'bookstore', 'cafe', 'classroom', 'computer_lab', 17 | 'conference_room', 'dinette', 'dining_room', 'exercise_room', 'foyer', 'furniture_store', 18 | 'home_office', 'home_storage', 'indoor_balcony', 'kitchen', 'laundry_room', 'living_room', 19 | 'office', 'office_kitchen', 'playroom', 'printer_room', 'reception_room', 'student_lounge', 20 | 'study', 'study_room'] 21 | 22 | if log: 23 | t = torch.exp(t) 24 | 25 | t = t.flatten().tolist() 26 | 27 | sorted_list = sorted(list(zip(scene_types, t)), key=(lambda x: x[1]), reverse=True) 28 | 29 | with open(save_path, 'w') as f: 30 | for i, (scene, value) in enumerate(sorted_list, start=1): 31 | print('{:02d}. 
{}: {:.3f}'.format(i, scene, value), file=f) 32 | 33 | 34 | def draw_normals(t: torch.Tensor, save_path: str) -> None: 35 | """Shows the network's predicted normals for an image 36 | """ 37 | 38 | if t.dim() == 4: 39 | t = losses.normalize(t) 40 | elif t.dim() == 3: 41 | t = t.unsqueeze(dim=0) 42 | t = losses.normalize(t) 43 | t = t.squeeze() 44 | 45 | t = (t + 1) / 2 46 | torchvision.utils.save_image(t, fp=save_path) 47 | 48 | 49 | def draw_depth(t: torch.Tensor, save_path: str, log: bool = True) -> None: 50 | """Shows the network's predicted depth for an image 51 | """ 52 | if log: 53 | t = torch.exp(t) 54 | 55 | max_depth, _ = torch.max(t.reshape(t.shape[0], -1), dim=-1) 56 | 57 | for _ in range(t.dim() - 1): 58 | max_depth = max_depth.unsqueeze(dim=-1) 59 | 60 | t = t / max_depth 61 | 62 | # Inverse colors 63 | t = 1 - t 64 | torchvision.utils.save_image(t, fp=save_path) 65 | 66 | 67 | def draw_predictions(img: torch.Tensor, 68 | encoder: rock.ssd.encoder.Encoder, 69 | ploc: torch.Tensor, 70 | plabel: torch.Tensor, 71 | idx: int, 72 | label_map: Dict[int, str], 73 | show: bool = True, 74 | save_path: Optional[str] = None, 75 | conf_threshold: float = 0.0) -> None: 76 | """Shows an input image and the predicted bounding boxes and confidence 77 | """ 78 | if label_map is None: 79 | label_map = {} 80 | img_height, img_width, _ = img.shape 81 | 82 | dec = encoder.decode_batch(ploc, plabel)[idx] 83 | 84 | pred_boxes, labels, confs = dec 85 | 86 | labels = np.array(labels) 87 | labels_num = None 88 | 89 | if label_map: 90 | labels_num = np.array(labels) 91 | labels = [label_map.get(label) for label in labels] 92 | 93 | fig, ax = plt.subplots(1) 94 | 95 | # If nothing predicted, don't add anything 96 | if not pred_boxes.shape[0] == 0: 97 | # Get box sizes in pixel and convert to l,t,w,h for COCOeval 98 | boxes = pred_boxes.cpu() * torch.tensor([img_width, img_height, img_width, img_height]) 99 | l, t, r, b = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3] 100 | cx, cy, w, h = (l + r) / 2, (t + b) / 2, r - l, b - t 101 | cx, cy, w, h = np.array(cx), np.array(cy), np.array(w), np.array(h) 102 | 103 | bboxes = zip(cx, cy, w, h) 104 | 105 | cmap = plt.get_cmap('tab20b') 106 | colors = [cmap(i) for i in np.linspace(0, 1, 20)] 107 | 108 | for (cx, cy, w, h), label, num_labels, conf in zip(bboxes, labels, labels_num, confs): 109 | if label == "background" or conf.item() < conf_threshold: 110 | continue 111 | 112 | color = colors[num_labels] 113 | bbox = patches.Rectangle((cx - 0.5 * w, cy - 0.5 * h), w, h, 114 | linewidth=3, edgecolor=color, facecolor='none') 115 | 116 | ax.add_patch(bbox) 117 | box_text = label + ' (' + str(round(conf.item() * 100, 1)) + '%)' 118 | plt.text(cx - 0.5 * w, cy - 0.5 * h, s=box_text, 119 | color='white', verticalalignment='top', 120 | bbox={'color': color, 'pad': 0}) 121 | 122 | img = img.cpu() 123 | ax.imshow(img) 124 | 125 | if save_path: 126 | plt.axis('off') 127 | plt.savefig(save_path, bbox_inches='tight', pad_inches=0, dpi=150) 128 | plt.axis('on') 129 | 130 | if show: 131 | plt.show() 132 | 133 | plt.close(fig) 134 | 135 | 136 | def draw_transforms(img: torch.Tensor, 137 | bboxes: torch.Tensor, 138 | labels: torch.Tensor, 139 | label_map: Dict[int, str], 140 | show: bool = True, 141 | save_path: str = None) -> None: 142 | """Draws transformed image 143 | """ 144 | 145 | # Modified from original draw patches method 146 | # Suppose bboxes in fractional coordinates 147 | if label_map is None: 148 | label_map = {} 149 | img = np.array(img) 150 | labels = 
np.array(labels) 151 | 152 | bboxes = np.array(bboxes.numpy()) 153 | 154 | labels_num = np.array(labels) 155 | labels = [label_map.get(label) for label in labels] 156 | 157 | cx, cy, w, h = bboxes[0, :], bboxes[1, :], bboxes[2, :], bboxes[3, :] 158 | 159 | htot, wtot, _ = img.shape 160 | cx *= wtot 161 | cy *= htot 162 | w *= wtot 163 | h *= htot 164 | 165 | bboxes = zip(cx, cy, w, h) 166 | 167 | cmap = plt.get_cmap('tab20b') 168 | colors = [cmap(i) for i in np.linspace(0, 1, 20)] 169 | 170 | fig, ax = plt.subplots(1) 171 | 172 | for (cx, cy, w, h), label, num_labels in zip(bboxes, labels, labels_num): 173 | if label == "background": 174 | continue 175 | 176 | color = colors[num_labels] 177 | 178 | bbox = patches.Rectangle((cx - 0.5 * w, cy - 0.5 * h), w, h, 179 | linewidth=3, edgecolor=color, facecolor='none') 180 | 181 | ax.add_patch(bbox) 182 | plt.text(cx - 0.5 * w, cy - 0.5 * h, s=label, 183 | color='white', verticalalignment='top', 184 | bbox={'color': color, 'pad': 0}) 185 | 186 | ax.imshow(img) 187 | 188 | if save_path: 189 | plt.axis('off') 190 | plt.savefig(save_path, bbox_inches='tight', pad_inches=0, dpi=150) 191 | plt.axis('on') 192 | 193 | if show: 194 | plt.show() 195 | 196 | plt.close(fig) 197 | 198 | 199 | def inv_norm(tensor: torch.Tensor) -> torch.Tensor: 200 | """Inverse of the normalization that was done during pre-processing 201 | """ 202 | inv_normalize = transforms.Normalize( 203 | mean=[-0.485 / 0.229, -0.456 / 0.224, -0.406 / 0.225], 204 | std=[1 / 0.229, 1 / 0.224, 1 / 0.225]) 205 | 206 | return inv_normalize(tensor).permute(1, 2, 0) 207 | 208 | 209 | def rock_label_map() -> Dict[int, str]: 210 | """ Mapping from label num to label name 211 | """ 212 | label_map = {0: 'background', 1: 'bathtub', 2: 'bed', 3: 'bookshelf', 4: 'box', 5: 'chair', 6: 'counter', 7: 'desk', 213 | 8: 'door', 9: 'dresser', 10: 'garbage bin', 11: 'lamp', 12: 'monitor', 13: 'night stand', 14: 'pillow', 214 | 15: 'sink', 16: 'sofa', 17: 'table', 18: 'television', 19: 'toilet'} 215 | 216 | return label_map 217 | -------------------------------------------------------------------------------- /rock/model/auxiliary.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch import nn 6 | 7 | 8 | class Scene(nn.Module): 9 | """Module for scene predictions 10 | """ 11 | def __init__(self, channels: int) -> None: 12 | super().__init__() 13 | 14 | self.channels = channels 15 | self.bn_out = nn.BatchNorm2d(2048, momentum=0.01, track_running_stats=True) 16 | 17 | self.scene_in = nn.Conv2d(in_channels=512, out_channels=self.channels, kernel_size=1) 18 | self.scene_out = nn.Conv2d(in_channels=self.channels, out_channels=2048, kernel_size=1) 19 | 20 | # Added test conv layers 21 | self.conv1 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1) 22 | self.conv2 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1) 23 | self.conv3 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=1) 24 | self.bn1 = nn.BatchNorm2d(256, momentum=0.01, track_running_stats=True) 25 | self.bn2 = nn.BatchNorm2d(256, momentum=0.01, track_running_stats=True) 26 | self.bn3 = nn.BatchNorm2d(512, momentum=0.01, track_running_stats=True) 27 | 28 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 29 | """ 30 | Shape: 31 | - X: :math:`(N, C_in, H, W)` where :math:`C_in = 512` 32 | - Output: :math:`(N, C_out, H, W)` where :math:`C_out = 2048` 33 | - 
Scene pred: :math:`(N, num_scenes)` where :math:`num_scenes = 27` 34 | 35 | """ 36 | # Added conv layers 37 | x = F.relu(self.bn1(self.conv1(x))) 38 | x = F.relu(self.bn2(self.conv2(x))) 39 | x = F.relu(self.bn3(self.conv3(x))) 40 | 41 | x = self.scene_in(x) 42 | 43 | pred = torch.mean(torch.flatten(x, start_dim=2), dim=-1) 44 | pred = nn.LogSoftmax(dim=-1)(pred) 45 | 46 | x = self.scene_out(x) 47 | x = self.bn_out(x) 48 | x = F.relu(x) 49 | 50 | return x, pred 51 | 52 | 53 | class Depth(nn.Module): 54 | """ Module for depth prediction 55 | """ 56 | 57 | def __init__(self, channels: int) -> None: 58 | super().__init__() 59 | 60 | self.channels = channels 61 | self.bn_out = nn.BatchNorm2d(2048, momentum=0.01, track_running_stats=True) 62 | 63 | self.depth_in = nn.Conv2d(in_channels=512, out_channels=self.channels, kernel_size=1) 64 | self.depth_out = nn.Conv2d(in_channels=self.channels, out_channels=2048, kernel_size=1) 65 | 66 | self.conv1 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1) 67 | self.conv2 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1) 68 | self.conv3 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=1) 69 | self.bn1 = nn.BatchNorm2d(256, momentum=0.01, track_running_stats=True) 70 | self.bn2 = nn.BatchNorm2d(256, momentum=0.01, track_running_stats=True) 71 | self.bn3 = nn.BatchNorm2d(512, momentum=0.01, track_running_stats=True) 72 | 73 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 74 | """ 75 | Shape: 76 | - X: :math:`(N, C_in, H, W)` where :math:`C_in = 512` 77 | - Output: :math:`(N, C_out, H, W)` where :math:`C_out = 2048` 78 | - Depth pred: :math:`(N, 1, H, W)` 79 | """ 80 | x = F.relu(self.bn1(self.conv1(x))) 81 | x = F.relu(self.bn2(self.conv2(x))) 82 | x = F.relu(self.bn3(self.conv3(x))) 83 | 84 | x = self.depth_in(x) 85 | 86 | # Shape is (N, C, H, W) 87 | pred = torch.mean(x, dim=1, keepdim=True) 88 | 89 | x = self.depth_out(x) 90 | x = self.bn_out(x) 91 | x = F.relu(x) 92 | 93 | return x, pred 94 | 95 | 96 | class Normals(nn.Module): 97 | """Module for normal predictions 98 | """ 99 | 100 | def __init__(self, channels: int) -> None: 101 | super().__init__() 102 | 103 | self.channels = channels 104 | self.bn_out = nn.BatchNorm2d(2048, momentum=0.01, track_running_stats=True) 105 | 106 | self.normals_in = nn.Conv2d(in_channels=512, out_channels=self.channels, kernel_size=1) 107 | self.normals_out = nn.Conv2d(in_channels=self.channels, out_channels=2048, kernel_size=1) 108 | 109 | self.conv1 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1) 110 | self.conv2 = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1) 111 | self.conv3 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=1) 112 | self.bn1 = nn.BatchNorm2d(256, momentum=0.01, track_running_stats=True) 113 | self.bn2 = nn.BatchNorm2d(256, momentum=0.01, track_running_stats=True) 114 | self.bn3 = nn.BatchNorm2d(512, momentum=0.01, track_running_stats=True) 115 | 116 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 117 | """ 118 | Shape: 119 | - X: :math:`(N, C_in, H, W)` where :math:`C_in = 512` 120 | - Output: :math:`(N, C_out, H, W)` where :math:`C_out = 2048` 121 | - Normals pred: :math:`(N, 3, H, W)` 122 | """ 123 | x = F.relu(self.bn1(self.conv1(x))) 124 | x = F.relu(self.bn2(self.conv2(x))) 125 | x = F.relu(self.bn3(self.conv3(x))) 126 | 127 | x = self.normals_in(x) 128 | 129 | normals_x, normals_y, normals_z = torch.split(x, round(self.channels / 3), dim=1) 130 | 
normals_x = torch.mean(normals_x, dim=1) 131 | normals_y = torch.mean(normals_y, dim=1) 132 | normals_z = torch.mean(normals_z, dim=1) 133 | pred = torch.stack([normals_x, normals_y, normals_z], dim=1) 134 | 135 | x = self.normals_out(x) 136 | x = self.bn_out(x) 137 | x = F.relu(x) 138 | 139 | return x, pred 140 | 141 | 142 | class ROCK(nn.Module): 143 | """ ROCK block combining scene, depth and normals prediction 144 | """ 145 | 146 | def __init__(self, 147 | aux_tasks: Tuple[str] = ('scene', 'depth', 'normals')) -> None: 148 | super().__init__() 149 | 150 | self.scene = 'scene' in aux_tasks 151 | self.depth = 'depth' in aux_tasks 152 | self.normals = 'normals' in aux_tasks 153 | 154 | self.backbone_channels = 2048 155 | self.scene_channels = 27 156 | self.depth_channels = 128 157 | self.normals_channels = 3 * 128 158 | 159 | self.conv1 = nn.Conv2d(in_channels=self.backbone_channels, out_channels=512, kernel_size=1) 160 | self.conv2 = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1) 161 | 162 | self.bn1 = nn.BatchNorm2d(512, momentum=0.01, track_running_stats=True) 163 | self.bn2 = nn.BatchNorm2d(512, momentum=0.01, track_running_stats=True) 164 | self.bn3 = nn.BatchNorm2d(self.backbone_channels, momentum=0.01, track_running_stats=True) 165 | 166 | self.scene_extractor = Scene(channels=self.scene_channels) 167 | self.depth_extractor = Depth(channels=self.depth_channels) 168 | self.normals_extractor = Normals(channels=self.normals_channels) 169 | 170 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, ...]: 171 | """ 172 | Shape: 173 | - X: :math:`(N, C_in, H, W)` where N is the batch size, and (H, W) is the feature map height and width 174 | - Output: :math:`(N, C_out, H, W)` where :math:`C_out = C_in` 175 | - Scene pred: :math:`(N, num_scenes)` 176 | - Depth pred: :math:`(N, 1, H, W)` 177 | - Normals pred: :math:`(N, 3, H, W)` 178 | """ 179 | 180 | identity = x 181 | 182 | x = F.relu(self.bn1(self.conv1(x))) 183 | x = F.relu(self.bn2(self.conv2(x))) 184 | 185 | out = identity 186 | aux_out = [] 187 | 188 | if self.scene: 189 | scene, scene_pred = self.scene_extractor(x) 190 | aux_out.append(scene_pred) 191 | out = out + scene 192 | else: 193 | aux_out.append(None) 194 | 195 | if self.depth: 196 | depth, depth_pred = self.depth_extractor(x) 197 | aux_out.append(depth_pred) 198 | out = out + depth 199 | else: 200 | aux_out.append(None) 201 | 202 | if self.normals: 203 | normals, normals_pred = self.normals_extractor(x) 204 | aux_out.append(normals_pred) 205 | out = out + normals 206 | else: 207 | aux_out.append(None) 208 | 209 | out = F.relu(self.bn3(out)) 210 | 211 | return (out, *aux_out) 212 | -------------------------------------------------------------------------------- /rock/eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from typing import Tuple, Dict, List, Any, Union 5 | 6 | import torch 7 | from pycocotools.coco import COCO 8 | from pycocotools.cocoeval import COCOeval 9 | from torch.utils.data import DataLoader 10 | 11 | import rock.datasets.nyu_depth_v2 12 | import rock.ssd.encoder 13 | import rock.ssd.prior_boxes 14 | import rock.datasets.transforms 15 | import rock.model.network 16 | import rock.utils.load 17 | import rock.utils.hide_print 18 | 19 | 20 | def evaluate_model(model_path: str, 21 | test_path: str = 'data/train_test/nyuv2_test', 22 | device: torch.device = torch.device("cuda"), 23 | aux: bool = True, 24 | aux_tasks: Tuple[str, 
...] = ('scene', 'depth', 'normals'), 25 | coco_json_save_path: str = 'data/eval/', 26 | show_all_cats: bool = False, 27 | verbose: bool = True) -> None: 28 | """ Loads a model and evaluates it 29 | """ 30 | if verbose: 31 | print("Evaluating model: {}".format(model_path)) 32 | 33 | pboxes = rock.ssd.prior_boxes.pboxes_rock() 34 | encoder = rock.ssd.encoder.Encoder(pboxes) 35 | test_trans = rock.datasets.transforms.Transformer(pboxes, (480, 640), train=False) 36 | test_data = rock.datasets.nyu_depth_v2.NYUv2Detection(test_path, transform=test_trans, auxiliary=aux) 37 | 38 | model = rock.model.network.rock_network(aux_tasks) if aux else rock.model.network.baseline_ssd() 39 | model = model.to(device) 40 | rock.utils.load.load_from_checkpoint(model_path, model, verbose=verbose) 41 | 42 | gt_path = os.path.join(coco_json_save_path, 'gt_box.json') 43 | dt_path = os.path.join(coco_json_save_path, 'pred_box.json') 44 | 45 | if verbose: 46 | ap1, ap2 = evaluate(model, test_data, encoder, device, gt_path=gt_path, dt_path=dt_path, 47 | show_all_cats=show_all_cats) 48 | else: 49 | with rock.utils.hide_print.HiddenPrints(): 50 | ap1, ap2 = evaluate(model, test_data, encoder, device, gt_path=gt_path, dt_path=dt_path, 51 | show_all_cats=show_all_cats) 52 | 53 | print('val mAP[0.50:0.95] = {:.4f}'.format(ap1)) 54 | print('val mAP[0.50] = {:.4f}'.format(ap2)) 55 | 56 | 57 | def evaluate(model: torch.nn.Module, 58 | dataset: rock.datasets.nyu_depth_v2.NYUv2Detection, 59 | encoder: rock.ssd.encoder.Encoder, 60 | device: torch.device = torch.device("cuda"), 61 | gt_path: str = 'data/eval/gt_box.json', 62 | dt_path: str = 'data/eval/pred_box.json', 63 | max_output: int = 100, 64 | show_all_cats: bool = False) -> Tuple[float, float]: 65 | """ Evaluates the network's output using COCOeval (adapted for the NYUv2 dataset) 66 | 67 | | 68 | Prints out the mAP and related metrics for the given model on the given dataset 69 | 70 | Args: 71 | model: network 72 | dataset: dataset on which to run eval 73 | encoder: encoder use to encode / decode the network's output 74 | device: device on which to run eval (default: cuda) 75 | gt_path: save path for ground truths bounding boxes json file 76 | dt_path: save path for predicted bounding boxes json file 77 | max_output: maximum number of bounding boxes to consider per image (default: 100) 78 | show_all_cats: whether to show AP for all categories or just an average (default: False) 79 | 80 | Returns: 81 | Average Precision (AP) @[ IoU=0.50:0.95] and AP @[ IoU=0.50] 82 | """ 83 | # Put model in eval mode 84 | model.eval() 85 | 86 | _create_coco_files(model, dataset, encoder, device, gt_path, dt_path, max_output) 87 | 88 | cocoGt = COCO(gt_path) 89 | cocoDt = cocoGt.loadRes(dt_path) 90 | 91 | E = COCOeval(cocoGt, cocoDt, iouType='bbox') 92 | 93 | if not show_all_cats: 94 | E.evaluate() 95 | E.accumulate() 96 | E.summarize() 97 | print("Current AP: {:.5f}".format(E.stats[0])) 98 | 99 | # Put model back in training mode 100 | model.train() 101 | 102 | return E.stats[0], E.stats[1] 103 | 104 | else: 105 | # Evaluation by category 106 | catIds = E.params.catIds 107 | cats = dataset.categories 108 | 109 | for elem in catIds: 110 | E.params.catIds = elem 111 | 112 | print("catId: " + str(elem)) 113 | print("catName: " + cats[elem]) 114 | E.evaluate() 115 | E.accumulate() 116 | E.summarize() 117 | print() 118 | 119 | print("All catIds: " + str(catIds)) 120 | E.params.catIds = catIds 121 | E.evaluate() 122 | E.accumulate() 123 | E.summarize() 124 | print("Current AP: 
{:.5f}".format(E.stats[0])) 125 | 126 | # Put model back in training mode 127 | model.train() 128 | 129 | return E.stats[0], E.stats[1] # Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] 130 | 131 | 132 | def _create_coco_files(model: torch.nn.Module, 133 | dataset: rock.datasets.nyu_depth_v2.NYUv2Detection, 134 | encoder: rock.ssd.encoder.Encoder, 135 | device: torch.device, 136 | gt_path: str, 137 | dt_path: str, 138 | max_output: int) -> None: 139 | # Create paths if they don't exist 140 | if not Path(gt_path).exists(): 141 | Path(os.path.dirname(gt_path)).mkdir(parents=True, exist_ok=True) 142 | 143 | if not Path(dt_path).exists(): 144 | Path(os.path.dirname(dt_path)).mkdir(parents=True, exist_ok=True) 145 | 146 | img_width, img_height = 640, 480 147 | 148 | gt_dict = _init_gt_dict(dataset) 149 | dt_dict = [] 150 | 151 | model.to(device) 152 | 153 | dataloader = DataLoader(dataset, batch_size=8, shuffle=False, num_workers=2) 154 | for nbatch, sample in enumerate(dataloader, start=1): 155 | print("Parsing batch: {}/{}".format(nbatch, len(dataloader)), end='\r') 156 | with torch.no_grad(): 157 | inp = sample['img'].to(device) 158 | img_id = sample['img_id'] 159 | 160 | ploc, plabel, *aux_out = model(inp) 161 | 162 | for id in img_id.tolist(): 163 | 164 | gt_dict['images'].append({"id": id, "width": img_width, "height": img_height, "file_name": None}) 165 | 166 | boxes, labels = dataset.get_eval(id) 167 | for i, (box, label) in enumerate(zip(boxes, labels)): 168 | annot = {} 169 | annot['id'] = id * 100 + i 170 | annot['image_id'] = id 171 | annot['category_id'] = label 172 | annot['bbox'] = [int(elem) for elem in list(box)] 173 | annot['area'] = int(box[2]) * int(box[3]) 174 | annot['iscrowd'] = 0 175 | annot['segmentation'] = None 176 | gt_dict['annotations'].append(annot) 177 | 178 | dec = encoder.decode_batch(ploc, plabel, max_output_num=max_output) 179 | 180 | for id, preds in zip(img_id.tolist(), dec): 181 | pred_boxes, labels, confs = preds 182 | 183 | # If nothing predicted, don't add anything 184 | if pred_boxes.shape[0] == 0: 185 | continue 186 | 187 | # Get box sizes in pixel and convert to l,t,w,h for COCOeval 188 | boxes = pred_boxes.cpu() * torch.tensor([img_width, img_height, img_width, img_height]) 189 | l, t, w, h = boxes[:, 0], boxes[:, 1], boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1] 190 | boxes[:, 0] = l 191 | boxes[:, 1] = t 192 | boxes[:, 2] = w 193 | boxes[:, 3] = h 194 | boxes = torch.round(boxes * 10) / 10 195 | 196 | for box, label, conf in zip(boxes.tolist(), labels.tolist(), confs.tolist()): 197 | annot = {} 198 | annot['image_id'] = id 199 | annot['category_id'] = label 200 | annot['bbox'] = box 201 | annot['score'] = conf 202 | dt_dict.append(annot) 203 | 204 | with open(gt_path, 'w') as outfile: 205 | json.dump(gt_dict, outfile) 206 | 207 | with open(dt_path, 'w') as outfile: 208 | json.dump(dt_dict, outfile) 209 | 210 | 211 | # noinspection PyDictCreation 212 | def _init_gt_dict(dataset: rock.datasets.nyu_depth_v2.NYUv2Detection) -> Dict[str, Union[List[Any], Dict[str, str]]]: 213 | d = {} 214 | d['info'] = {"description": "Subset of NYUv2 Dataset", 215 | "url": "", 216 | "version": "", 217 | "year": 2020, 218 | "contributor": "", 219 | "date_created": ""} 220 | d['licenses'] = [] 221 | d['images'] = [] 222 | d['annotations'] = [] 223 | d['categories'] = [] 224 | 225 | # Remove background as a category 226 | for i, name in enumerate(dataset.categories[1:], start=1): 227 | d['categories'].append({"supercategory": "home", "id": 
int(i), "name": name}) 228 | 229 | return d 230 | -------------------------------------------------------------------------------- /rock/trainer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import time 4 | from pathlib import Path 5 | from typing import Optional, Tuple, Union, Iterable 6 | 7 | import torch 8 | import torch.optim as optim 9 | from torch.utils.data import DataLoader 10 | from torch.utils.tensorboard import SummaryWriter 11 | 12 | import rock.datasets.nyu_depth_v2 13 | import rock.model.network 14 | import rock.eval 15 | import rock.utils.load 16 | import rock.utils.show 17 | import rock.utils.hide_print 18 | import rock.ssd.encoder 19 | import rock.ssd.prior_boxes 20 | import rock.datasets.transforms 21 | import rock.model.losses 22 | import rock.training 23 | 24 | 25 | def train(train_path: str, 26 | val_path: Optional[str] = None, 27 | device: torch.device = torch.device("cuda"), 28 | num_iters: int = 30_000, 29 | lr: float = 5e-5, 30 | weight_decay: float = 2e-3, 31 | scheduler_milestones: Iterable[int] = (25_000, ), 32 | scheduler_gamma: float = 0.1, 33 | forced_crops: bool = False, 34 | aux: bool = True, 35 | aux_tasks: Tuple[str, ...] = ('scene', 'depth', 'normals'), 36 | use_all_priors_conf_loss: bool = False, 37 | writer_path: Optional[str] = None, 38 | save_path: Optional[str] = None, 39 | checkpoint_path: Optional[str] = None, 40 | coco_json_save_path: str = 'data/eval', 41 | save_best_on_val: bool = False, 42 | val_eval_freq: Union[int, None] = 10, 43 | train_eval_freq: Union[int, None] = 50, 44 | image_to_tb_freq: Union[int, None] = 20, 45 | model_save_freq: Union[int, None] = None, 46 | verbose: bool = True) -> None: 47 | # Initialize dataset and model 48 | if save_path: 49 | Path(save_path).mkdir(parents=True, exist_ok=True) 50 | save_path = os.path.join(save_path, '') 51 | 52 | if writer_path: 53 | writer_path = os.path.join(writer_path, '') 54 | 55 | pboxes = rock.ssd.prior_boxes.pboxes_rock() 56 | encoder = rock.ssd.encoder.Encoder(pboxes) 57 | 58 | train_trans = rock.datasets.transforms.Transformer(pboxes, (480, 640), train=True, forced_crops=forced_crops) 59 | test_trans = rock.datasets.transforms.Transformer(pboxes, (480, 640), train=False) 60 | 61 | train_data = rock.datasets.nyu_depth_v2.NYUv2Detection(train_path, transform=train_trans, auxiliary=aux) 62 | train_eval_data = rock.datasets.nyu_depth_v2.NYUv2Detection(train_path, transform=test_trans, auxiliary=aux) 63 | train_loader = torch.utils.data.DataLoader(train_data, batch_size=8, shuffle=True, num_workers=2, drop_last=True) 64 | 65 | if val_path: 66 | val_data = rock.datasets.nyu_depth_v2.NYUv2Detection(val_path, transform=test_trans, auxiliary=aux) 67 | val_loader = torch.utils.data.DataLoader(val_data, batch_size=8, shuffle=False, num_workers=2, drop_last=True) 68 | else: 69 | val_data = None 70 | val_loader = None 71 | 72 | gt_path = os.path.join(coco_json_save_path, 'gt_box.json') 73 | dt_path = os.path.join(coco_json_save_path, 'pred_box.json') 74 | 75 | label_map = train_data.label_map 76 | 77 | train_len = len(train_loader) 78 | 79 | epochs = math.ceil(num_iters / train_len) 80 | model = rock.model.network.rock_network(aux_tasks) if aux else rock.model.network.baseline_ssd() 81 | model = model.to(device) 82 | loss_func = rock.model.losses.Loss(pboxes, 83 | auxiliary=aux, 84 | aux_tasks=aux_tasks, 85 | use_all_priors_conf_loss=use_all_priors_conf_loss).to(device) 86 | 87 | optimizer = optim.Adam(model.parameters(), 
lr=lr, weight_decay=weight_decay) 88 | scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=scheduler_milestones, gamma=scheduler_gamma) 89 | 90 | writer = SummaryWriter(writer_path) if writer_path is not None else None 91 | 92 | total_time = 0 93 | iteration = 0 94 | total_iters = 0 95 | start_epoch = 0 96 | best_ap = -1.0 97 | ap1, ap2 = 0.0, 0.0 98 | 99 | # Loading model 100 | if checkpoint_path: 101 | start_epoch, iteration = rock.utils.load.load_from_checkpoint(checkpoint_path, 102 | model, optimizer, scheduler, verbose) 103 | 104 | if verbose: 105 | print("Resuming training of model stored at: {}".format(checkpoint_path)) 106 | print(scheduler.state_dict()) 107 | 108 | if verbose: 109 | if aux: 110 | print("ROCK block enabled") 111 | else: 112 | print("ROCK block disabled") 113 | 114 | if val_path: 115 | print("Training on {}, evaluating on {}".format(train_path, val_path)) 116 | else: 117 | print("Training on {}, no val set".format(train_path)) 118 | 119 | print("Training the model for {} epochs ({} iters) \n".format(epochs, num_iters)) 120 | 121 | ############### 122 | # Training loop 123 | for epoch in range(start_epoch, start_epoch + epochs): 124 | start_epoch_time = time.time() 125 | 126 | train_loss, iteration = rock.training.train_loop(model, loss_func, train_loader, epoch, iteration, 127 | optimizer, scheduler, writer=writer, device=device) 128 | 129 | if val_loader: 130 | val_loss = rock.training.val_loop(model, loss_func, val_loader, epoch, writer=writer, device=device) 131 | else: 132 | val_loss = None 133 | 134 | out_str = '[Epoch {}] train loss: {:.4f}'.format(epoch, train_loss) 135 | if val_loss: 136 | out_str += ' / val loss: {:.4f}'.format(val_loss) 137 | 138 | if verbose: 139 | print(out_str) 140 | else: 141 | if ap1 > 0.0: 142 | out_str += ' / latest val mAP[0.50:0.95]: {:.4f}'.format(ap1) 143 | print(out_str, end='\r') 144 | 145 | end_epoch_time = time.time() - start_epoch_time 146 | total_time += end_epoch_time 147 | 148 | ######################## 149 | # Evaluate and save model 150 | 151 | # Val data eval 152 | if val_data and val_eval_freq and epoch % val_eval_freq == 0: 153 | if verbose: 154 | print() 155 | print("[Val] Epoch {} eval".format(epoch)) 156 | ap1, ap2 = rock.eval.evaluate(model, val_data, encoder, device, gt_path=gt_path, dt_path=dt_path) 157 | else: 158 | with rock.utils.hide_print.HiddenPrints(): 159 | ap1, ap2 = rock.eval.evaluate(model, val_data, encoder, device, gt_path=gt_path, dt_path=dt_path) 160 | 161 | if writer: 162 | writer.add_scalar('val mAP[0.50:0.95]', ap1, epoch) 163 | writer.add_scalar('val mAP[0.50]', ap2, epoch) 164 | 165 | if verbose: 166 | print('val mAP[0.50:0.95] = {:.4f}'.format(ap1)) 167 | print('val mAP[0.50] = {:.4f}'.format(ap2)) 168 | print() 169 | 170 | # Save model if it has the best ap 171 | if save_path and save_best_on_val: 172 | if (ap1 + ap2) > best_ap: 173 | best_ap = ap1 + ap2 174 | obj = {'epoch': epoch+1, 'iteration': iteration, 'optimizer': optimizer.state_dict(), 175 | 'scheduler': scheduler.state_dict(), 176 | 'model': model.state_dict()} 177 | torch.save(obj, '{}best_model.pt'.format(save_path)) 178 | 179 | if verbose: 180 | print("new best mAP, saved model at epoch {}".format(epoch)) 181 | 182 | # Train data eval 183 | if train_eval_freq and epoch % train_eval_freq == 0: 184 | if verbose: 185 | print() 186 | print("[Train] Epoch {} eval".format(epoch)) 187 | train_ap1, train_ap2 = rock.eval.evaluate(model, train_eval_data, encoder, device, gt_path=gt_path, 188 | dt_path=dt_path) 189 | else: 
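# When not verbose, run the same train-set evaluation inside HiddenPrints so that
# COCOeval's console output is suppressed while the AP values are still computed
# and written to TensorBoard.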
190 | with rock.utils.hide_print.HiddenPrints(): 191 | train_ap1, train_ap2 = rock.eval.evaluate(model, train_eval_data, encoder, device, gt_path=gt_path, 192 | dt_path=dt_path) 193 | 194 | if writer: 195 | writer.add_scalar('train mAP[0.50:0.95]', train_ap1, epoch) 196 | writer.add_scalar('train mAP[0.50]', train_ap2, epoch) 197 | 198 | if verbose: 199 | print('train mAP[0.50:0.95] = {:.4f}'.format(train_ap1)) 200 | print('train mAP[0.50] = {:.4f}'.format(train_ap2)) 201 | print() 202 | 203 | # Upload images to tensorboard 204 | if writer and image_to_tb_freq and epoch % image_to_tb_freq == 0: 205 | train_eval_grid = rock.utils.show.predict_grid(model, train_eval_data, encoder, label_map) 206 | writer.add_image("train_grid", train_eval_grid, epoch) 207 | 208 | if val_data: 209 | val_grid = rock.utils.show.predict_grid(model, val_data, encoder, label_map) 210 | writer.add_image("val_grid", val_grid, epoch) 211 | 212 | train_grid = rock.utils.show.predict_grid(model, train_data, encoder, label_map) 213 | writer.add_image("train_grid (with crops)", train_grid, epoch) 214 | 215 | # Save model 216 | if save_path and model_save_freq and epoch % model_save_freq == 0: 217 | obj = {'epoch': epoch+1, 'iteration': iteration, 'optimizer': optimizer.state_dict(), 218 | 'scheduler': scheduler.state_dict(), 219 | 'model': model.state_dict()} 220 | torch.save(obj, '{}epoch_{}.pt'.format(save_path, epoch)) 221 | if verbose: 222 | print("saved model at epoch {}".format(epoch)) 223 | 224 | total_iters = iteration 225 | 226 | # End of training 227 | if save_path: 228 | obj = {'epoch': start_epoch + epochs, 'iteration': total_iters, 'optimizer': optimizer.state_dict(), 229 | 'scheduler': scheduler.state_dict(), 230 | 'model': model.state_dict()} 231 | torch.save(obj, '{}final_model.pt'.format(save_path)) 232 | print() 233 | print("saved final model") 234 | 235 | if val_data: 236 | print("Final model eval") 237 | 238 | if verbose: 239 | ap1, ap2 = rock.eval.evaluate(model, val_data, encoder, device, gt_path=gt_path, dt_path=dt_path, 240 | show_all_cats=True) 241 | else: 242 | with rock.utils.hide_print.HiddenPrints(): 243 | ap1, ap2 = rock.eval.evaluate(model, val_data, encoder, device, gt_path=gt_path, dt_path=dt_path, 244 | show_all_cats=True) 245 | 246 | print('val mAP[0.50:0.95] = {:.4f}'.format(ap1)) 247 | print('val mAP[0.50] = {:.4f}'.format(ap2)) 248 | print() 249 | 250 | print('Total training time: {}'.format(total_time)) 251 | 252 | if writer: 253 | writer.close() 254 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 EPFL/VITA 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /rock/prep.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from pathlib import Path 4 | from typing import Tuple, Optional 5 | 6 | import h5py 7 | import numpy 8 | import numpy as np 9 | from PIL import Image 10 | from scipy.io import loadmat 11 | 12 | 13 | def prep_data(dataset_path: str, 14 | splits_path: str, 15 | normals_path: str, 16 | train_save_path: str, 17 | test_save_path: str, 18 | val_save_path: str, 19 | val_split_path: Optional[str] = None, 20 | verbose: bool = True) -> None: 21 | """ Prepares data (usually from given command-line arguments) 22 | """ 23 | if verbose: 24 | if val_split_path: 25 | print("Train/val/test split") 26 | else: 27 | print("Train/test split") 28 | 29 | print("Beginning preprocessing...") 30 | 31 | dataset = NYUv2Preprocessing(dataset_path, splits_path, normals_path) 32 | 33 | if val_split_path: 34 | dataset.add_val(val_split_path) 35 | 36 | dataset.save(train_save_path, subset='train') 37 | if verbose: 38 | print("Saved train set to: {}".format(train_save_path)) 39 | 40 | dataset.save(test_save_path, subset='test') 41 | if verbose: 42 | print("Saved test set to: {}".format(test_save_path)) 43 | 44 | if val_split_path: 45 | dataset.save(val_save_path, subset='val') 46 | if verbose: 47 | print("Saved val set to: {}".format(val_save_path)) 48 | 49 | if verbose: 50 | print("Done!") 51 | 52 | 53 | class NYUv2Preprocessing(object): 54 | """Pre-processes the NYUv2 dataset 55 | Parses .mat files from the NYUv2 dataset, extracts necessary info 56 | and finds bounding boxes 57 | """ 58 | 59 | def __init__(self, dataset_path: str, splits_path: str, normals_path: str) -> None: 60 | self.in_f = h5py.File(dataset_path, 'r') 61 | self.nyuv2 = {} 62 | 63 | for name, data in self.in_f.items(): 64 | self.nyuv2[name] = data 65 | 66 | self.imgs, self.depths, self.labels, self.label_instances = self.__get_arrs() 67 | 68 | self.len = self.imgs.shape[0] 69 | 70 | self.scene_types = self.__read_mat_text(self.nyuv2['sceneTypes']) 71 | self.class_list = self.__read_mat_text(self.nyuv2['names']) 72 | self.scenes = self.__read_mat_text((self.nyuv2['scenes'])) 73 | 74 | self.scene_ids, self.unique_scenes = self.__get_scene_ids() 75 | 76 | self.bboxes, self.rock_classes = self._get_all_bboxes() 77 | 78 | self.train_idx, self.test_idx = self._splits(splits_path) 79 | 80 | self.val = False 81 | self.val_idx = [] 82 | 83 | self.masks, self.normals = get_surface_normals(normals_path) 84 | 85 | def save(self, path: str, subset: str = 'all') -> None: 86 | """Saves a specified subset of the data at a given folder path. 87 | 88 | Subset can be `train`, `test`, `val` or `all`. 
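A minimal usage sketch (the file paths below are illustrative, not fixed by this repo):

    prep = NYUv2Preprocessing('data/nyu_depth_v2_labeled.mat',
                              'data/splits.mat',
                              'data/normals')
    prep.save('data/train_test/nyuv2_train', subset='train')
    prep.save('data/train_test/nyuv2_test', subset='test')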
89 | """ 90 | Path(path).mkdir(parents=True, exist_ok=True) 91 | 92 | if subset == 'train': 93 | self._save_subset(path, self.train_idx, digits=4) 94 | elif subset == 'test': 95 | self._save_subset(path, self.test_idx, digits=4) 96 | elif subset == 'val': 97 | self._save_subset(path, self.val_idx, digits=4) 98 | elif subset == 'all': 99 | self._save_subset(path, range(self.len), digits=4) 100 | else: 101 | print("Couldn't find specified subset") 102 | 103 | def add_val(self, path: str) -> None: 104 | """Adds validation set using the path of a file containing the list of scenes part of the validation set 105 | """ 106 | 107 | if not self.val: 108 | with open(path, 'r') as f: 109 | y = f.read().splitlines() 110 | 111 | for i, elem in enumerate(self.scenes): 112 | if elem in y: 113 | self.val_idx.append(i) 114 | 115 | self.train_idx = [i for i in self.train_idx if i not in self.val_idx] 116 | self.val = True 117 | 118 | def _save_elem(self, path: str, idx: int) -> None: 119 | """ Save a specified data sample at a given path 120 | """ 121 | d = {} 122 | d['img'] = self.imgs[idx] 123 | d['depth'] = self.depths[idx] 124 | d['labels'] = self.labels[idx] 125 | d['label_instance'] = self.label_instances[idx] 126 | d['scene_type'] = self.scene_types[idx] 127 | d['scene'] = self.scenes[idx] 128 | d['scene_id'] = self.scene_ids[idx] 129 | d['normals'] = self.normals[idx] 130 | d['mask'] = self.masks[idx] 131 | d['bboxes'] = self.bboxes[idx] 132 | d['rock_classes'] = self.rock_classes 133 | d['unique_scenes'] = self.unique_scenes 134 | 135 | with open(path, 'wb') as handle: 136 | pickle.dump(d, handle, protocol=pickle.HIGHEST_PROTOCOL) 137 | 138 | def _save_subset(self, path, idx, digits=4): 139 | """ Save a specified subset of the data at a given path 140 | """ 141 | for elem in idx: 142 | filename = str(elem).rjust(digits, '0') + '.pkl' 143 | file_path = os.path.join(path, filename) 144 | self._save_elem(file_path, elem) 145 | 146 | @staticmethod 147 | def _splits(splits_path): 148 | """ Splits the dataset into a test set and training set 149 | """ 150 | splits = loadmat(splits_path) 151 | 152 | train_splits = splits['trainNdxs'] - 1 153 | test_splits = splits['testNdxs'] - 1 154 | 155 | train_idx = [elem.item() for elem in train_splits] 156 | test_idx = [elem.item() for elem in test_splits] 157 | 158 | return train_idx, test_idx 159 | 160 | @staticmethod 161 | def _transpose_3d_from_mat(data): 162 | """ Transposes for .mat array format to numpy array format 163 | """ 164 | elem_list = [np.transpose(elem, (2, 1, 0)) for elem in data] 165 | elems = np.stack(elem_list, axis=0) 166 | return elems 167 | 168 | @staticmethod 169 | def _transpose_2d_from_mat(data): 170 | """ Transposes for .mat array format to numpy array format 171 | """ 172 | elem_list = [np.transpose(elem, (1, 0)) for elem in data] 173 | elems = np.stack(elem_list, axis=0) 174 | return elems 175 | 176 | def __get_arrs(self): 177 | """ Gets the images, depths, labels and label_instances as numpy arrays 178 | """ 179 | imgs = self._transpose_3d_from_mat(self.nyuv2['images']) 180 | depths = self._transpose_2d_from_mat(self.nyuv2['depths']) 181 | labels = self._transpose_2d_from_mat(self.nyuv2['labels']) 182 | label_instances = self._transpose_2d_from_mat(self.nyuv2['instances']) 183 | 184 | return imgs, depths, labels, label_instances 185 | 186 | def __read_mat_text(self, h5_dataset): 187 | """ Reads text from a .mat file 188 | """ 189 | item_list = [u''.join(chr(c.item()) for c in self.in_f[obj_ref]) for obj_ref in (h5_dataset[0, :])] 190 
| return item_list 191 | 192 | def __get_scene_ids(self): 193 | """ Obtains the scene ids for each sample 194 | """ 195 | unique_scenes = sorted(list(set(self.scene_types))) 196 | 197 | scene_type_to_id = {} 198 | for i, scene in enumerate(unique_scenes): 199 | scene_type_to_id[scene] = i 200 | 201 | scene_ids = [scene_type_to_id[scene_type] for scene_type in self.scene_types] 202 | 203 | return scene_ids, unique_scenes 204 | 205 | @staticmethod 206 | def _get_rock_class(names_to_ids): 207 | """ Obtains the object classes and ids for the ROCK paper 208 | """ 209 | 210 | rock_class_names = ['bathtub', 'bed', 'bookshelf', 'box', 'chair', 'counter', 'desk', 211 | 'door', 'dresser', 'garbage bin', 'lamp', 'monitor', 'night stand', 212 | 'pillow', 'sink', 'sofa', 'table', 'television', 'toilet'] 213 | 214 | rock_class_ids = [names_to_ids[name] for name in rock_class_names] 215 | 216 | return rock_class_names, rock_class_ids 217 | 218 | @staticmethod 219 | def _get_image_instances(labels, label_instances, class_ids): 220 | """ Obtains the object instances 221 | """ 222 | # Keep only the objects from the indicated classes 223 | masks = np.isin(labels, class_ids) 224 | instances_masked = label_instances * masks 225 | labels_masked = labels * masks 226 | 227 | # Create new array of the same shape as the original instance array 228 | arr = np.zeros(labels.shape) 229 | 230 | # For each image 231 | for i in range(labels.shape[0]): 232 | count = 0 233 | 234 | # Get the classes of that image 235 | classes = np.unique(labels_masked[i, :, :]) 236 | classes = classes[classes != i] 237 | 238 | # Map each instance of each class to a unique instance number 239 | for elem in classes: 240 | class_instances = instances_masked[i, :, :] * (labels_masked[i, :, :] == elem) 241 | 242 | for j in range(1, class_instances.max() + 1): 243 | count += 1 244 | arr[i, :, :][class_instances == j] = count 245 | 246 | return arr.astype(int) 247 | 248 | # noinspection PyDictCreation 249 | @staticmethod 250 | def _bbox(instances, labels, ids_to_names, class_names): 251 | """Obtains the bounding boxes for an image 252 | """ 253 | 254 | bbox_list = [] 255 | for (i, instance) in enumerate(instances): 256 | img_bbox_list = [] 257 | # noinspection PyDictCreation 258 | for j in range(1, instance.max() + 1): 259 | a = np.where(instance == j) 260 | bbox = {} 261 | bbox['box'] = np.min(a[1]), np.min(a[0]), np.max(a[1]), np.max(a[0]) # x1, y1, x2, y2 262 | old_label = labels[i, a[0][0], a[1][0]] 263 | bbox['label_name'] = ids_to_names[old_label] 264 | 265 | # Remap to new indices (from 1 to 20, in alphabetical order) 266 | bbox['labels'] = class_names.index(bbox['label_name']) 267 | img_bbox_list.append(bbox) 268 | bbox_list.append(img_bbox_list) 269 | 270 | return bbox_list 271 | 272 | def _get_all_bboxes(self): 273 | """ Obtains all bounding boxes for the specified rock classes 274 | """ 275 | 276 | labels, label_instances, class_list = self.labels, self.label_instances, self.class_list 277 | names_to_ids = {} 278 | ids_to_names = {} 279 | for i, name in enumerate(class_list, start=1): 280 | names_to_ids[name] = i 281 | ids_to_names[i] = name 282 | 283 | rock_class_names, rock_class_ids = self._get_rock_class(names_to_ids) 284 | 285 | # Add background class as the first index 286 | rock_class_names.insert(0, "background") 287 | 288 | rock_instances = self._get_image_instances(labels, label_instances, rock_class_ids) 289 | 290 | bboxes = self._bbox(rock_instances, labels, ids_to_names, rock_class_names) 291 | 292 | return bboxes, 
rock_class_names 293 | 294 | 295 | def get_surface_normals(path: str) -> Tuple[numpy.ndarray, numpy.ndarray]: 296 | """Obtains arrays of surface normals and normals mask arrays from input image 297 | 298 | Args: 299 | path (str): path of the folder containing folders of normals and masks 300 | 301 | Returns: 302 | (tuple): tuple containing: 303 | masks (numpy.ndarray): array of image masks 304 | normals (numpy.ndarray): list of normals 305 | """ 306 | 307 | masks_path = os.path.join(path, "masks") 308 | normals_path = os.path.join(path, "normals") 309 | 310 | masks_files = sorted([os.path.join(masks_path, file) for file in os.listdir(masks_path) if file.endswith(".png")]) 311 | normals_files = sorted( 312 | [os.path.join(normals_path, file) for file in os.listdir(normals_path) if file.endswith(".png")]) 313 | 314 | masks = np.stack([np.array(Image.open(file)) for file in masks_files], axis=0) 315 | normals = np.stack([(np.array(Image.open(file))) for file in normals_files], axis=0) 316 | 317 | return masks, normals 318 | 319 | 320 | 321 | -------------------------------------------------------------------------------- /rock/ssd/encoder.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List, Union 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | import rock.ssd.prior_boxes 7 | 8 | DecoderOutput = Union[List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]], None] 9 | 10 | 11 | def iou(box1: torch.Tensor, box2: torch.Tensor) -> torch.Tensor: 12 | """Compute the intersection over union of two set of boxes, each box is [x1,y1,x2,y2]. 13 | 14 | Shape: 15 | - box1: :math:`(N, 4)` 16 | - box2: :math:`(M, 4)` 17 | - Output: :math:`(N, M)` 18 | 19 | Modified from: https://github.com/kuangliu/pytorch-ssd 20 | """ 21 | N = box1.shape[0] 22 | M = box2.shape[0] 23 | 24 | lt = torch.max( 25 | box1[:, :2].unsqueeze(1).expand(N, M, 2), # [N,2] -> [N,1,2] -> [N,M,2] 26 | box2[:, :2].unsqueeze(0).expand(N, M, 2), # [M,2] -> [1,M,2] -> [N,M,2] 27 | ) 28 | 29 | rb = torch.min( 30 | box1[:, 2:].unsqueeze(1).expand(N, M, 2), # [N,2] -> [N,1,2] -> [N,M,2] 31 | box2[:, 2:].unsqueeze(0).expand(N, M, 2), # [M,2] -> [1,M,2] -> [N,M,2] 32 | ) 33 | 34 | wh = rb - lt # [N,M,2] 35 | wh[wh < 0] = 0 # clip at 0 36 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 37 | 38 | area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1]) # [N,] 39 | area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1]) # [M,] 40 | area1 = area1.unsqueeze(1).expand_as(inter) # [N,] -> [N,1] -> [N,M] 41 | area2 = area2.unsqueeze(0).expand_as(inter) # [M,] -> [1,M] -> [N,M] 42 | 43 | iou_out = inter / (area1 + area2 - inter) 44 | return iou_out 45 | 46 | 47 | class Encoder: 48 | """ Encodes and decodes from (coordinates, labels) bounding boxes to SSD prior boxes 49 | 50 | Code modified from: 51 | - https://github.com/lufficc/SSD 52 | - https://github.com/kuangliu/pytorch-ssd 53 | - https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Detection/SSD 54 | - https://github.com/amdegroot/ssd.pytorch 55 | """ 56 | 57 | def __init__(self, pboxes: rock.ssd.prior_boxes.PriorBoxes) -> None: 58 | self.pboxes = pboxes(order="ltrb") 59 | self.pboxes_xywh = pboxes(order="xywh").unsqueeze(dim=0) 60 | self.num_priors = self.pboxes.shape[0] 61 | self.variance_xy = pboxes.variance_xy 62 | self.variance_wh = pboxes.variance_wh 63 | 64 | def area_of(self, 65 | left_top: torch.Tensor, 66 | right_bottom: torch.Tensor) -> torch.Tensor: 67 | """Compute the areas of 
rectangles given two corners. 68 | 69 | Shape: 70 | - left_top: :math:`(*, 2)` where * means any number of dimensions 71 | - right_bottom: :math:`(*, 2)` where * from left_top = * from right_bottom 72 | - area (out): :math:`(*)` where * from out = * from left_top 73 | 74 | :math:`*` is a constant here and is the same for left_top, right_bottom and out 75 | 76 | Modified from: https://github.com/lufficc/SSD 77 | """ 78 | hw = torch.clamp(right_bottom - left_top, min=0.0) 79 | return hw[..., 0] * hw[..., 1] 80 | 81 | def iou_of(self, 82 | boxes0: torch.Tensor, 83 | boxes1: torch.Tensor, eps: float = 1e-5) -> torch.Tensor: 84 | """Return intersection-over-union (Jaccard index) of boxes. 85 | 86 | Shape: 87 | - boxes0: :math:`(N1, M1, 4)` 88 | - boxes1 :math: (N2, M2, 4) 89 | - out: :math:`(N, M)` where N = max(N1, N2) and M = max(M1, M2) 90 | 91 | Modified from: https://github.com/lufficc/SSD 92 | """ 93 | overlap_left_top = torch.max(boxes0[..., :2], boxes1[..., :2]) 94 | overlap_right_bottom = torch.min(boxes0[..., 2:], boxes1[..., 2:]) 95 | 96 | overlap_area = self.area_of(overlap_left_top, overlap_right_bottom) 97 | area0 = self.area_of(boxes0[..., :2], boxes0[..., 2:]) 98 | area1 = self.area_of(boxes1[..., :2], boxes1[..., 2:]) 99 | return overlap_area / (area0 + area1 - overlap_area + eps) 100 | 101 | def encode(self, 102 | gt_boxes: torch.Tensor, 103 | gt_labels: torch.Tensor, 104 | iou_bg_threshold: float = 0.4, 105 | iou_box_threshold: float = 0.5) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 106 | """Assign ground truth boxes and targets to priors. 107 | 108 | Shape: 109 | - gt_boxes: :math:`(num_targets, 4)` 110 | - gt_labels: :math:`(num_targets)` 111 | - boxes (out): :math:`(num_priors, 4)` 112 | - labels (out): :math:`(num_priors)` 113 | - mask (out): :math:`(num_priors)` 114 | 115 | Modified from assign_priors function of: https://github.com/lufficc/SSD 116 | """ 117 | # If the image has no boxes, return the default 118 | if gt_boxes.shape[0] == 0: 119 | return self._no_box_encode() 120 | 121 | corner_form_priors = self.pboxes 122 | 123 | ious = self.iou_of(gt_boxes.unsqueeze(0), corner_form_priors.unsqueeze(1)) 124 | # size: num_priors 125 | best_target_per_prior, best_target_per_prior_index = ious.max(1) 126 | # size: num_targets 127 | best_prior_per_target, best_prior_per_target_index = ious.max(0) 128 | 129 | for target_index, prior_index in enumerate(best_prior_per_target_index): 130 | best_target_per_prior_index[prior_index] = target_index 131 | 132 | # size: num_priors 133 | labels = gt_labels[best_target_per_prior_index] 134 | 135 | # 2.0 is used to make sure every target has a prior assigned 136 | best_target_per_prior.index_fill_(0, best_prior_per_target_index, 2) 137 | labels[best_target_per_prior < iou_bg_threshold] = 0 # the background id 138 | boxes = gt_boxes[best_target_per_prior_index] 139 | 140 | # Add a mask to exclude certain priors within a threshold 141 | bg_mask = best_target_per_prior < iou_bg_threshold 142 | box_mask = best_target_per_prior > iou_box_threshold 143 | mask = bg_mask | box_mask 144 | 145 | x = 0.5 * (boxes[:, 0] + boxes[:, 2]) 146 | y = 0.5 * (boxes[:, 1] + boxes[:, 3]) 147 | w = -boxes[:, 0] + boxes[:, 2] 148 | h = -boxes[:, 1] + boxes[:, 3] 149 | 150 | boxes[:, 0] = x 151 | boxes[:, 1] = y 152 | boxes[:, 2] = w 153 | boxes[:, 3] = h 154 | 155 | return boxes, labels, mask 156 | 157 | def _no_box_encode(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 158 | """ Assign correct priors if there are no bounding boxes in the 
image 159 | """ 160 | labels = torch.zeros(self.num_priors, dtype=torch.long) 161 | mask = torch.ones(self.num_priors, dtype=torch.bool) 162 | boxes = self.pboxes.clone() 163 | # Transform format to xywh format 164 | x, y, w, h = 0.5 * (boxes[:, 0] + boxes[:, 2]), \ 165 | 0.5 * (boxes[:, 1] + boxes[:, 3]), \ 166 | -boxes[:, 0] + boxes[:, 2], \ 167 | -boxes[:, 1] + boxes[:, 3] 168 | boxes[:, 0] = x 169 | boxes[:, 1] = y 170 | boxes[:, 2] = w 171 | boxes[:, 3] = h 172 | 173 | return boxes, labels, mask 174 | 175 | def transform_back_batch(self, 176 | boxes: torch.Tensor, 177 | scores: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 178 | """ Bounding box format transformation from network output to format for non-max suppression 179 | 180 | Applies back variance_xy and variance_wh, permutes dims of boxes, switches from xywh to ltrb 181 | and applies softmax to the scores 182 | 183 | Shape: 184 | - boxes (input): :math:`(N, 4, num_priors)` where N is the batch_size 185 | - scores (input): :math:`(N, num_labels, num_priors)` 186 | - boxes (out): :math:`(N, num_priors, 4)` 187 | - scores (out): :math:`(N, num_priors, num_labels)` 188 | 189 | Modified from: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Detection/SSD 190 | """ 191 | 192 | if boxes.device != self.pboxes.device: 193 | self.pboxes = self.pboxes.to(boxes.device) 194 | self.pboxes_xywh = self.pboxes_xywh.to(boxes.device) 195 | 196 | boxes = boxes.permute(0, 2, 1) 197 | scores = scores.permute(0, 2, 1) 198 | 199 | boxes[:, :, :2] = self.variance_xy * boxes[:, :, :2] 200 | boxes[:, :, 2:] = self.variance_wh * boxes[:, :, 2:] 201 | 202 | boxes[:, :, :2] = boxes[:, :, :2] * self.pboxes_xywh[:, :, 2:] + self.pboxes_xywh[:, :, :2] 203 | boxes[:, :, 2:] = boxes[:, :, 2:].exp() * self.pboxes_xywh[:, :, 2:] 204 | 205 | # Transform format to ltrb 206 | l, t, r, b = boxes[:, :, 0] - 0.5 * boxes[:, :, 2], \ 207 | boxes[:, :, 1] - 0.5 * boxes[:, :, 3], \ 208 | boxes[:, :, 0] + 0.5 * boxes[:, :, 2], \ 209 | boxes[:, :, 1] + 0.5 * boxes[:, :, 3] 210 | 211 | boxes[:, :, 0] = l 212 | boxes[:, :, 1] = t 213 | boxes[:, :, 2] = r 214 | boxes[:, :, 3] = b 215 | 216 | return boxes, F.softmax(scores, dim=-1) 217 | 218 | def decode_batch(self, 219 | boxes: torch.Tensor, 220 | scores: torch.Tensor, 221 | nms_iou_threshold: float = 0.3, 222 | max_output_num: int = 200) -> DecoderOutput: 223 | """ Decode network prediction tensor and obtain bounding boxes location, label and confidence 224 | 225 | Shape: 226 | - boxes (input): :math:`(N, 4, num_priors)` where N is the batch size 227 | - scores (input): :math:`(N, num_labels, num_priors)` 228 | - location (out): :math:`(M, 4)` where M is the number of detected boxes 229 | - label (out): :math:`(M)` 230 | - confidence (out): :math:`(M)` 231 | 232 | Output is a list of length `batch_size` containing tuples of location, label and confidence tensors 233 | 234 | Modified from: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Detection/SSD 235 | """ 236 | bboxes, probs = self.transform_back_batch(boxes, scores) 237 | 238 | output = [] 239 | for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)): 240 | bbox = bbox.squeeze(0) 241 | prob = prob.squeeze(0) 242 | output.append(self.decode_single(bbox, prob, nms_iou_threshold, max_output_num)) 243 | return output 244 | 245 | @staticmethod 246 | def decode_single(boxes: torch.Tensor, 247 | scores: torch.Tensor, 248 | nms_iou_threshold: float, 249 | max_output: int, 250 | max_num: int = 200) -> Tuple[torch.Tensor, torch.Tensor, 
torch.Tensor]: 251 | """Performs non-maximum suppression and returns the decoded bounding boxes 252 | 253 | Shape: 254 | - boxes (input): :math:`(num_priors, 4)` 255 | - scores (input): :math:`(num_priors, num_labels)` 256 | - location (out): :math:`(M, 4)` where M is the number of detected boxes 257 | - label (out): :math:`(M)` 258 | - confidence (out): :math:`(M)` 259 | 260 | Modified from: https://github.com/amdegroot/ssd.pytorch 261 | """ 262 | boxes_out = [] 263 | scores_out = [] 264 | labels_out = [] 265 | 266 | for i, score in enumerate(scores.split(1, 1)): 267 | # skip background 268 | if i == 0: 269 | continue 270 | 271 | score = score.squeeze(1) 272 | mask = score > 0.05 273 | 274 | bboxes, score = boxes[mask, :], score[mask] 275 | if score.shape[0] == 0: 276 | continue 277 | 278 | score_sorted, score_idx_sorted = score.sort(dim=0) 279 | 280 | # select max_output indices 281 | score_idx_sorted = score_idx_sorted[-max_num:] 282 | candidates = [] 283 | 284 | while score_idx_sorted.numel() > 0: 285 | idx = score_idx_sorted[-1].item() 286 | bboxes_sorted = bboxes[score_idx_sorted, :] 287 | bboxes_idx = bboxes[idx, :].unsqueeze(dim=0) 288 | iou_sorted = iou(bboxes_sorted, bboxes_idx).squeeze() 289 | # we only need iou < criteria 290 | score_idx_sorted = score_idx_sorted[iou_sorted < nms_iou_threshold] 291 | candidates.append(idx) 292 | 293 | boxes_out.append(bboxes[candidates, :]) 294 | scores_out.append(score[candidates]) 295 | labels_out.extend([i] * len(candidates)) 296 | 297 | if not boxes_out: 298 | out = (torch.tensor([]), torch.tensor([]), torch.tensor([])) 299 | return out 300 | 301 | boxes_out = torch.cat(boxes_out, dim=0) 302 | labels_out = torch.tensor(labels_out, dtype=torch.long) 303 | scores_out = torch.cat(scores_out, dim=0) 304 | 305 | _, max_ids = scores_out.sort(dim=0) 306 | max_ids = max_ids[-max_output:] 307 | 308 | return boxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids] 309 | -------------------------------------------------------------------------------- /rock/model/losses.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | import rock.ssd.prior_boxes 7 | 8 | 9 | def normalize(x: torch.Tensor) -> torch.Tensor: 10 | """ Normalizes tensor for surface normals 11 | 12 | Shape: 13 | - X: :math:`(N, *)` where N is the batch size and * means, any number of additional dimensions ≥1 14 | - Output: :math:`(N, *)`, same size as the input 15 | 16 | """ 17 | l2_norm = torch.clamp(torch.norm(x, dim=1, keepdim=True), min=1e-5) 18 | x = x / l2_norm 19 | return x 20 | 21 | 22 | def reverse_huber_loss(x: torch.Tensor, 23 | target: torch.Tensor) -> torch.Tensor: 24 | """ Computes the reverse huber loss in log space 25 | 26 | Shape: 27 | - X: :math:`(N, C, H, W)` where N is the batch size and C is the number of channels (1 for depth) 28 | - Target: :math:`(N, C, H, W)` 29 | - Output: scalar 30 | """ 31 | # Reverse huber loss in log space 32 | x = torch.abs(x - torch.log(torch.clamp(target, min=1e-2, max=10))) 33 | 34 | # Fix c to a specific value (can be changed) 35 | c = 0.5 36 | gt_c = (x > c).float() 37 | le_c = (x <= c).float() 38 | 39 | linear = x * le_c 40 | quadratic = (((x ** 2) + (c ** 2)) / (2 * c)) * gt_c 41 | loss = linear + quadratic 42 | loss = torch.mean(loss) 43 | return loss 44 | 45 | 46 | def huber_loss(x: torch.Tensor, 47 | target: torch.Tensor) -> torch.Tensor: 48 | """ Computes a Smooth-L1 loss (Huber loss) in log 
space 49 | 50 | Shape: 51 | - X: :math:`(N, C, H, W)` where N is the batch size and C is the number of channels (1 for depth) 52 | - Target: :math:`(N, C, H, W)` 53 | - Output: scalar 54 | """ 55 | log_target = torch.log(torch.clamp(target, min=1e-3, max=10)) 56 | smooth_l1 = nn.SmoothL1Loss(reduction='mean') 57 | loss = smooth_l1(x, log_target) 58 | return loss 59 | 60 | 61 | def surface_normals_loss(x: torch.Tensor, 62 | target: torch.Tensor, 63 | mask: torch.Tensor) -> torch.Tensor: 64 | """ Computes the surface normals loss by combining the dot_product with the L2 loss 65 | 66 | Shape: 67 | - X: :math:`(N, C, H, W)` where N is the batch size and C is the number of channels (3 for normals) 68 | - Target: :math:`(N, C, H, W)` 69 | - Mask: :math:`(N, 1, H, W)` 70 | - Output: scalar 71 | """ 72 | 73 | x = normalize(x) 74 | 75 | # Set the minimum dot product value to 0 76 | dot_product = -torch.sum(x * target, dim=1) + 1 77 | l2_loss = torch.sum((x - target) ** 2, dim=1) 78 | 79 | loss = (dot_product + l2_loss) * mask.squeeze(dim=1).float() 80 | mean = torch.sum(loss) / torch.sum(mask.float()) 81 | return mean 82 | 83 | 84 | def scene_cross_entropy_loss(scene_pred: torch.Tensor, 85 | scene_gt: torch.Tensor) -> torch.Tensor: 86 | """ Computes the cross entropy loss for scenes where scene_pred is a log softmax input using NLL loss 87 | 88 | Shape: 89 | - scene_pred: :math:`(N, C)` where N is the batch size and C is the number of scene types in the dataset 90 | - scene_gt: :math:`(N)` 91 | - Output: scalar 92 | 93 | """ 94 | cross_entropy_loss = nn.NLLLoss(reduction='mean') 95 | loss = cross_entropy_loss(scene_pred, scene_gt) 96 | 97 | return loss 98 | 99 | 100 | class MultiTaskLoss(nn.Module): 101 | """ Implements the multi-task loss as the sum of the following: 102 | 1. Scene Loss: (weight 3) 103 | 2. Depth Loss: (weight 3) 104 | 3. 
Normals Loss: (weight 30) 105 | """ 106 | 107 | def __init__(self, 108 | aux_tasks: Tuple[str] = ('scene', 'depth', 'normals')) -> None: 109 | super(MultiTaskLoss, self).__init__() 110 | 111 | self.scene = 'scene' in aux_tasks 112 | self.depth = 'depth' in aux_tasks 113 | self.normals = 'normals' in aux_tasks 114 | 115 | self.scene_weight = 3 116 | self.depth_weight = 3 117 | self.normals_weight = 3 * 10 118 | 119 | def forward(self, 120 | sample: Dict[str, torch.Tensor], 121 | loss_dict: Optional[Dict[str, float]] = None) -> torch.Tensor: 122 | """ Forward method of Multi-task loss 123 | 124 | Keys of dict are: `scene_pred`, `scene_gt`, `depth_pred`, 125 | `depth_gt`, `normals_pred`, `normals_gt`, `normals_mask` 126 | 127 | Shape: 128 | - scene_pred: :math:`(N, num_scenes)` where N is the batch size & :math:`num_scenes` is the number of scenes 129 | - scene_gt: :math:`(N)` 130 | - depth_pred: :math:`(N, 1, H, W)` where :math:`H, W` is the height and width respectively 131 | - depth_gt: :math:`(N, 1, H, W)` 132 | - normals_pred: :math:`(N, 3, H, W)` 133 | - normals_gt: :math:`(N, 3, H, W)` 134 | - normals_mask: :math:`(N, 1, H, W)` 135 | - Output: scalar 136 | """ 137 | losses = [] 138 | 139 | if self.scene: 140 | scene_pred, scene_gt = sample['scene_pred'], sample['scene_gt'] 141 | scene_loss = scene_cross_entropy_loss(scene_pred, scene_gt) 142 | losses.append(self.scene_weight * scene_loss) 143 | 144 | if loss_dict is not None: 145 | loss_dict['z_scene_loss'] += scene_loss.item() 146 | 147 | if self.depth: 148 | depth_pred, depth_gt = sample['depth_pred'], sample['depth_gt'] 149 | depth_loss = huber_loss(depth_pred, depth_gt) 150 | losses.append(self.depth_weight * depth_loss) 151 | 152 | if loss_dict is not None: 153 | loss_dict['z_depth_loss'] += depth_loss.item() 154 | 155 | if self.normals: 156 | normals_pred, normals_gt, normals_mask = sample['normals_pred'], sample['normals_gt'], sample[ 157 | 'normals_mask'] 158 | normals_loss = surface_normals_loss(normals_pred, normals_gt, normals_mask) 159 | losses.append(self.normals_weight * normals_loss) 160 | 161 | if loss_dict is not None: 162 | loss_dict['z_normals_loss'] += normals_loss.item() 163 | 164 | total_loss = torch.sum(torch.stack(losses)) 165 | 166 | return total_loss 167 | 168 | 169 | class DetectionLoss(nn.Module): 170 | """ Implements the detection loss as the sum of the following: 171 | 172 | 1. Confidence Loss: All labels, with hard negative mining if using _hard_neg_mining_conf_loss \ 173 | or with all background labels if using _all_priors_conf_loss (weight 1) 174 | 175 | 2. 
Localization Loss: Only on positive labels (weight 6) 176 | 177 | Suppose input pboxes has the shape 7228x4 178 | 179 | Modified from: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Detection/SSD 180 | """ 181 | 182 | def __init__(self, pboxes: rock.ssd.prior_boxes.PriorBoxes, use_all_priors_conf_loss: bool = False) -> None: 183 | super(DetectionLoss, self).__init__() 184 | 185 | self.huber_loss = nn.SmoothL1Loss(reduction='none') 186 | self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='none') 187 | self.use_all_priors_conf_loss = use_all_priors_conf_loss 188 | 189 | self.loc_weight = 6 190 | self.conf_weight = 1 191 | 192 | self.variance_xy = pboxes.variance_xy 193 | self.variance_wh = pboxes.variance_wh 194 | self.pboxes = nn.Parameter(pboxes(order="xywh").transpose(0, 1).unsqueeze(dim=0), requires_grad=False) 195 | 196 | def _loc_gt(self, loc: torch.Tensor) -> torch.Tensor: 197 | """Generate Location Vectors 198 | 199 | Shape: 200 | - loc: :math:`(N, 4, num_priors)` where N is the batch size 201 | - Output: :math:`(N, 4, num_priors)` 202 | """ 203 | 204 | gxy = loc[:, :2, :] - self.pboxes[:, :2, :] 205 | gxy = gxy / (self.variance_xy * self.pboxes[:, 2:, :]) 206 | 207 | gwh = loc[:, 2:, :] / self.pboxes[:, 2:, :] 208 | gwh = torch.log(gwh) / self.variance_wh 209 | 210 | return torch.cat((gxy, gwh), dim=1).contiguous() 211 | 212 | def _loc_loss(self, 213 | bbox_mask: torch.Tensor, 214 | bg_mask: torch.Tensor, 215 | gloc: torch.Tensor, 216 | ploc: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 217 | """ Calculates the bounding box localization loss 218 | 219 | Returns the localization loss and the number of positive prior boxes per sample 220 | """ 221 | # Loc loss 222 | loc_gt = self._loc_gt(gloc) 223 | loc_loss_mask = (bg_mask & bbox_mask).float() 224 | pos_num = loc_loss_mask.sum(dim=1) 225 | loc_loss = self.huber_loss(ploc, loc_gt).sum(dim=1) 226 | loc_loss = (loc_loss_mask * loc_loss).sum(dim=1) 227 | return loc_loss, pos_num 228 | 229 | def _hard_neg_mining_conf_loss(self, bbox_mask: torch.Tensor, 230 | bg_mask: torch.Tensor, 231 | glabel: torch.Tensor, 232 | plabel: torch.Tensor, 233 | pos_num: torch.Tensor) -> torch.Tensor: 234 | """ Confidence loss using hard negative mining, used in both the SSD and ROCK paper 235 | """ 236 | conf = self.cross_entropy_loss(plabel, glabel) 237 | conf = conf * bbox_mask.float() 238 | conf_neg = conf.clone() 239 | conf_neg[bg_mask] = 0 240 | _, conf_idx = conf_neg.sort(dim=1, descending=True) 241 | _, conf_rank = conf_idx.sort(dim=1) 242 | # number of negative three times positive 243 | neg_num = torch.clamp(3 * pos_num, max=bg_mask.shape[1]).unsqueeze(-1) 244 | neg_mask = (conf_rank < neg_num) 245 | conf_loss_mask = (bg_mask | neg_mask).float() 246 | conf_loss = (conf * conf_loss_mask).sum(dim=1) 247 | return conf_loss 248 | 249 | def all_priors_conf_loss(self, bbox_mask, bg_mask, glabel, plabel, pos_num): 250 | """ Confidence loss where all positive and negative priors are used, but weighted differently based on 251 | the proportion of positive priors compared to negative priors 252 | """ 253 | conf = self.cross_entropy_loss(plabel, glabel) 254 | conf = conf * bbox_mask.float() 255 | neg_num = (~bg_mask).float().sum(dim=1) 256 | 257 | # Multiply negative examples by the proportion of positive examples 258 | neg_weight = pos_num / neg_num 259 | # Multiply negative examples loss by 5 to speed up learning a bit, but not too much 260 | conf_neg = 5 * neg_weight * (conf * (~bg_mask).float()).sum(dim=1) 261 | conf_pos = 
(conf * bg_mask.float()).sum(dim=1) 262 | 263 | conf_loss = conf_pos + conf_neg 264 | return conf_loss 265 | 266 | def forward(self, 267 | sample: Dict[str, torch.Tensor], 268 | loss_dict: Optional[Dict[str, float]] = None) -> torch.Tensor: 269 | """ Forward method of the detection loss 270 | 271 | Keys of dict are: `ploc`, `plabel`, `bboxes`, `labels`, `bboxes_mask` 272 | 273 | Shape: 274 | - ploc: :math:`(N, 4, num_priors)` where N is the batch size and :math:`num_priors` is the number of priors 275 | - plabel: :math:`(N, num_labels, num_priors)` where :math:`num_labels` is the number of object labels 276 | - gloc: :math:`(N, 4, num_priors)` 277 | - glabel: :math:`(N, num_priors)` 278 | - bboxes_mask: :math:`(N, num_priors)` 279 | - Output: scalar 280 | """ 281 | ploc = sample['ploc'] 282 | plabel = sample['plabel'] 283 | gloc = sample['bboxes'] 284 | glabel = sample['labels'] 285 | bbox_mask = sample['bboxes_mask'] 286 | 287 | bg_mask = glabel > 0 288 | 289 | loc_loss, pos_num = self._loc_loss(bbox_mask, bg_mask, gloc, ploc) 290 | 291 | # Classification loss 292 | 293 | if self.use_all_priors_conf_loss: 294 | # all priors loss 295 | conf_loss = self.all_priors_conf_loss(bbox_mask, bg_mask, glabel, plabel, pos_num) 296 | else: 297 | # hard negative mining loss 298 | conf_loss = self._hard_neg_mining_conf_loss(bbox_mask, bg_mask, glabel, plabel, pos_num) 299 | 300 | # Sum losses together 301 | # Weight losses differently 302 | total_loss = self.loc_weight * loc_loss + self.conf_weight * conf_loss 303 | num_mask = (pos_num > 0).float() 304 | pos_num = pos_num.float().clamp(min=1e-6) 305 | total_loss = (total_loss * num_mask / pos_num).mean(dim=0) 306 | 307 | if loss_dict is not None: 308 | loss_dict['z_loc_loss'] += (loc_loss * num_mask / pos_num).mean(dim=0).item() 309 | loss_dict['z_conf_loss'] += (conf_loss * num_mask / pos_num).mean(dim=0).item() 310 | 311 | return total_loss 312 | 313 | 314 | class Loss(nn.Module): 315 | """ Combines the detection loss with the multi-task loss, if existing 316 | """ 317 | 318 | def __init__(self, pboxes: rock.ssd.prior_boxes.PriorBoxes, 319 | auxiliary: bool = True, 320 | aux_tasks: Tuple[str] = ('scene', 'depth', 'normals'), 321 | use_all_priors_conf_loss: bool = False) -> None: 322 | super(Loss, self).__init__() 323 | 324 | self.detection_loss = DetectionLoss(pboxes, use_all_priors_conf_loss=use_all_priors_conf_loss) 325 | self.multi_task_loss = MultiTaskLoss(aux_tasks=aux_tasks) 326 | 327 | self.auxiliary = auxiliary 328 | self.aux_tasks = aux_tasks 329 | 330 | def forward(self, 331 | sample: Dict[str, torch.Tensor], 332 | loss_dict: Dict[str, float] = None) -> torch.Tensor: 333 | """ Forward method of the loss 334 | 335 | Keys of dict are: `ploc`, `plabel`, `bboxes`, `labels`, `bboxes_mask` 336 | and if auxiliary is `True`, keys related to auxiliary tasks are: 337 | `scene_pred`, `scene_gt`, `depth_pred`, `depth_gt`, `normals_pred`, `normals_gt`, `normals_mask` 338 | 339 | Shape: 340 | - Input: given in respective loss functions 341 | - Output: scalar 342 | """ 343 | 344 | if self.auxiliary and self.aux_tasks: 345 | detection_loss = self.detection_loss(sample, loss_dict) 346 | multi_task_loss = self.multi_task_loss(sample, loss_dict) 347 | loss = detection_loss + multi_task_loss 348 | else: 349 | loss = self.detection_loss(sample, loss_dict) 350 | 351 | return loss 352 | -------------------------------------------------------------------------------- /rock/datasets/transforms.py: 
-------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | from typing import Dict, Any, Optional, Tuple 4 | 5 | import PIL 6 | import torch 7 | from PIL import Image 8 | from torchvision import transforms as transforms 9 | 10 | import rock.ssd.prior_boxes 11 | from rock.ssd.encoder import Encoder 12 | from rock.ssd.encoder import iou 13 | 14 | 15 | class SSDCropping(object): 16 | """ Cropping for SSD, according to the original paper, but with fixed aspect ratios 17 | 18 | Randomly choose between the following 3 options: 19 | 1. Keep the original image 20 | 2. Random crop where minimum IoU is randomly chosen between 0.1, 0.3, 0.5, 0.7, 0.9 21 | 3. Random crop 22 | Modified from https://github.com/chauhan-utk/ssd.DomainAdaptation/blob/master/utils/augmentations.py 23 | """ 24 | 25 | def __init__(self, forced_crops: bool = False) -> None: 26 | """ 27 | Args: 28 | forced_crops: force all images to be cropped (default: False) 29 | """ 30 | 31 | self.sample_options = [ 32 | # min IoU, max IoU 33 | (0.1, None), 34 | (0.3, None), 35 | (0.5, None), 36 | (0.7, None), 37 | (0.9, None), 38 | # no IoU requirements 39 | (None, None), 40 | ] 41 | 42 | if not forced_crops: 43 | # Add "do nothing" 44 | self.sample_options.append(None) 45 | # Make random cropping twice as likely as no cropping by adding a second (None, None) option 46 | # Increases classification task regularization on the ROCK dataset 47 | self.sample_options.append((None, None)) 48 | 49 | def __call__(self, 50 | sample: Dict[str, Any], 51 | auxiliary: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: 52 | """Crops the given sample 53 | 54 | Args: 55 | sample: input sample 56 | auxiliary: Dict specifying info on input sample to be used for cropping the depth and normals map, 57 | None if no auxiliary features (default is None) 58 | 59 | Returns: 60 | random crop of the input sample with fixed aspect ratio 61 | """ 62 | 63 | img = sample['img'] 64 | img_size = sample['size'] 65 | bboxes = sample['bboxes'] 66 | labels = sample['labels'] 67 | 68 | # Ensure always return cropped image 69 | while True: 70 | mode = random.choice(self.sample_options) 71 | 72 | # If no crop or box has no bboxes, don't crop 73 | if (mode is None) or (bboxes.shape[0] == 0): 74 | return sample 75 | 76 | htot, wtot = img_size 77 | 78 | min_iou, max_iou = mode 79 | min_iou = float("-inf") if min_iou is None else min_iou 80 | max_iou = float("+inf") if max_iou is None else max_iou 81 | 82 | # Implementation uses 30 iterations to find a possible candidate 83 | for _ in range(30): 84 | 85 | # area of each sampled crop uniformly distributed in[0.1, 1], 86 | w = math.sqrt(random.uniform(0.1, 1.0)) 87 | h = w 88 | 89 | left = random.uniform(0, 1.0 - w) 90 | top = random.uniform(0, 1.0 - h) 91 | 92 | right = left + w 93 | bottom = top + h 94 | 95 | ious = iou(bboxes, torch.tensor([[left, top, right, bottom]])) 96 | 97 | # tailor all the bboxes and return 98 | if not ((ious > min_iou) & (ious < max_iou)).all(): 99 | continue 100 | 101 | # discard any bboxes whose center not in the cropped image 102 | xc = 0.5 * (bboxes[:, 0] + bboxes[:, 2]) 103 | yc = 0.5 * (bboxes[:, 1] + bboxes[:, 3]) 104 | 105 | masks = (xc > left) & (xc < right) & (yc > top) & (yc < bottom) 106 | 107 | # if no such boxes, continue searching again 108 | if not masks.any(): 109 | continue 110 | 111 | bboxes[bboxes[:, 0] < left, 0] = left 112 | bboxes[bboxes[:, 1] < top, 1] = top 113 | bboxes[bboxes[:, 2] > right, 2] = right 114 | bboxes[bboxes[:, 
3] > bottom, 3] = bottom 115 | 116 | bboxes = bboxes[masks, :] 117 | labels = labels[masks] 118 | 119 | left_idx = int(left * wtot) 120 | top_idx = int(top * htot) 121 | right_idx = int(right * wtot) 122 | bottom_idx = int(bottom * htot) 123 | img = img.crop((left_idx, top_idx, right_idx, bottom_idx)) 124 | 125 | bboxes[:, 0] = (bboxes[:, 0] - left) / w 126 | bboxes[:, 1] = (bboxes[:, 1] - top) / h 127 | bboxes[:, 2] = (bboxes[:, 2] - left) / w 128 | bboxes[:, 3] = (bboxes[:, 3] - top) / h 129 | 130 | htot = bottom_idx - top_idx 131 | wtot = right_idx - left_idx 132 | 133 | if auxiliary is not None: 134 | ltrb = (left_idx, top_idx, right_idx, bottom_idx) 135 | auxiliary['scaling_factor'] = 1 / w 136 | sample = self._crop_multi_task(sample, ltrb) 137 | 138 | sample['img'] = img 139 | sample['size'] = (htot, wtot) 140 | sample['bboxes'] = bboxes 141 | sample['labels'] = labels 142 | 143 | return sample 144 | 145 | @staticmethod 146 | def _crop_multi_task(sample, ltrb): 147 | if sample['depth']: 148 | sample['depth'] = sample['depth'].crop(ltrb) 149 | 150 | if sample['normals'] and sample['normals_mask']: 151 | sample['normals'] = sample['normals'].crop(ltrb) 152 | sample['normals_mask'] = sample['normals_mask'].crop(ltrb) 153 | 154 | return sample 155 | 156 | 157 | class RandomHorizontalFlip(object): 158 | """Horizontally flips a sample with a given probability 159 | """ 160 | 161 | def __init__(self, p: float = 0.5) -> None: 162 | """ 163 | Args: 164 | p: flip probability 165 | """ 166 | 167 | self.p = p 168 | 169 | def __call__(self, 170 | sample: Dict[str, Any], 171 | auxiliary: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: 172 | """ 173 | Args: 174 | sample (Dict[str, Any]): input sample 175 | auxiliary (Optional[Dict[str, Any]]): None if no auxiliary features, Dict specifying info on input 176 | sample otherwise, to be used for flipping the depth and normals map 177 | 178 | Returns: 179 | (Dict[str, Any]): horizontal flip of the input sample 180 | 181 | """ 182 | 183 | img = sample['img'] 184 | bboxes = sample['bboxes'] 185 | 186 | if random.random() < self.p: 187 | 188 | # Make sure to only flip boxes if they exist 189 | if bboxes.shape[0] != 0: 190 | bboxes[:, 0], bboxes[:, 2] = 1.0 - bboxes[:, 2], 1.0 - bboxes[:, 0] 191 | 192 | img = img.transpose(Image.FLIP_LEFT_RIGHT) 193 | 194 | if auxiliary is not None: 195 | auxiliary['flip'] = True 196 | sample = self._flip_multi_task(sample) 197 | 198 | sample['img'] = img 199 | sample['bboxes'] = bboxes 200 | return sample 201 | 202 | @staticmethod 203 | def _flip_multi_task(sample): 204 | sample['depth'] = sample['depth'].transpose(Image.FLIP_LEFT_RIGHT) 205 | sample['normals'] = sample['normals'].transpose(Image.FLIP_LEFT_RIGHT) 206 | sample['normals_mask'] = sample['normals_mask'].transpose(Image.FLIP_LEFT_RIGHT) 207 | 208 | return sample 209 | 210 | 211 | class DepthTrans(object): 212 | """Transforms the depth map from a PIL Image to a Tensor matching the size of the backbone output feature map 213 | """ 214 | 215 | def __init__(self, 216 | size: Tuple[int, int] = (480, 640), 217 | scale: float = 16) -> None: 218 | """ 219 | Args: 220 | size: size of the backbone input image 221 | scale: scaling of the backbone 222 | """ 223 | self.size = (round(size[0] / scale), round(size[1] / scale)) 224 | 225 | def __call__(self, 226 | depth: PIL.Image, 227 | auxiliary: Dict[str, Any]) -> torch.Tensor: 228 | """ 229 | Args: 230 | depth: depth image 231 | auxiliary: dictionary containing info on the auxiliary features 232 | 233 | Returns: 234 | 
resized tensor 235 | 236 | """ 237 | trans = transforms.Compose([ 238 | transforms.Resize(self.size, Image.NEAREST), 239 | transforms.ToTensor()]) 240 | 241 | depth = trans(depth) 242 | depth = depth / auxiliary['scaling_factor'] 243 | 244 | return depth 245 | 246 | 247 | class NormalsTrans(object): 248 | """Transforms the normals and normals mask 249 | 250 | Transforms the normals and normals mask from PIL Images to normalized Tensors matching the size of 251 | the backbone output feature map 252 | """ 253 | 254 | def __init__(self, 255 | size: Tuple[int, int] = (480, 640), 256 | scale: float = 16) -> None: 257 | """ 258 | Args: 259 | size (Tuple[int, int]): size of the backbone input image 260 | scale (int): scaling of the backbone 261 | """ 262 | self.size = (round(size[0] / scale), round(size[1] / scale)) 263 | 264 | @staticmethod 265 | def normalize(x): 266 | """Normalizes the normals 267 | 268 | Args: 269 | x (tensor): input tensor 270 | 271 | Returns: 272 | (tensor): normalized tensor 273 | 274 | """ 275 | l2_norm = torch.clamp(torch.norm(x, dim=0), min=1e-5) 276 | x = x / l2_norm 277 | return x 278 | 279 | def __call__(self, 280 | normals: PIL.Image, 281 | normals_mask: PIL.Image, 282 | auxiliary: Dict[str, Any]) -> Tuple[torch.Tensor, torch.Tensor]: 283 | """ 284 | Args: 285 | normals: normals image 286 | normals_mask: 1-channel normals mask image 287 | auxiliary: dictionary containing info on the auxiliary features 288 | 289 | Returns: 290 | Tuple[torch.Tensor, torch.Tensor]: tuple containing the transformed normals and normals_mask tensors 291 | 292 | """ 293 | 294 | trans = transforms.Compose([ 295 | transforms.Resize(self.size, Image.NEAREST), 296 | transforms.ToTensor()]) 297 | 298 | mean = 0.5 299 | normals = trans(normals) - mean 300 | normals_mask = trans(normals_mask).type(torch.bool) 301 | 302 | if auxiliary['flip']: 303 | normals[0, :, :] = -normals[0, :, :] 304 | 305 | normals = self.normalize(normals) 306 | 307 | if auxiliary['scaling_factor'] != 1: 308 | normals[2, :, :] = normals[2, :, :] * auxiliary['scaling_factor'] 309 | normals = self.normalize(normals) 310 | 311 | return normals, normals_mask 312 | 313 | 314 | class Transformer(object): 315 | """ Transform input sample into dict of tensors, and optionally perform data augmentation 316 | """ 317 | 318 | def __init__(self, 319 | pboxes: rock.ssd.prior_boxes.PriorBoxes, 320 | size: Tuple[int, int] = (480, 640), 321 | train: bool = True, 322 | forced_crops: bool = False) -> None: 323 | """ 324 | Args: 325 | pboxes: prior boxes 326 | size: input image size (default: (480, 640)) 327 | train: indicate whether to apply train transformations or not (default: True) 328 | forced_crops: force all train images to be cropped (default: False) 329 | """ 330 | 331 | self.size = size 332 | self.train = train 333 | 334 | self.pboxes = pboxes 335 | self.encoder = Encoder(self.pboxes) 336 | 337 | self.crop = SSDCropping(forced_crops) 338 | self.hflip = RandomHorizontalFlip() 339 | 340 | self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 341 | std=[0.229, 0.224, 0.225]) 342 | 343 | self.img_train_trans = transforms.Compose([ 344 | transforms.Resize(self.size), 345 | transforms.ColorJitter(brightness=0.15, contrast=0.25, 346 | saturation=0.25, hue=0.05), 347 | transforms.ToTensor(), 348 | self.normalize 349 | ]) 350 | 351 | self.depth_trans = DepthTrans(size=self.size, scale=self.pboxes.scale_change()) 352 | self.normals_trans = NormalsTrans(size=self.size, scale=self.pboxes.scale_change()) 353 | 354 | 
self.img_test_trans = transforms.Compose([ 355 | transforms.Resize(self.size), 356 | transforms.ToTensor(), 357 | self.normalize]) 358 | 359 | def __call__(self, sample: Dict[str, Any]) -> Dict[str, Any]: 360 | """ 361 | Performs data augmentation on the image 362 | 363 | Keys of dict are: `img`, `img_id`, `size`, `bboxes`, `labels`, `bboxes_mask`, `scene_id`, 364 | `depth`, `normals`, `normals_mask`, `auxiliary` 365 | 366 | Input types are: 367 | - `img` (PIL.Image) 368 | - `img_id` (int) 369 | - `size` (Tuple[int, int]) 370 | - `bboxes` (torch.Tensor) Shape: :math:`(num_targets, 4)` 371 | - `labels` (torch.Tensor) Shape: :math:`(num_targets)` 372 | - `bboxes_mask` (None) 373 | - `scene_id` (int) 374 | - `depth` (PIL.Image) 375 | - `normals` (PIL.Image) 376 | - `normals_mask` (PIL.Image) 377 | - `auxiliary` (bool) 378 | | 379 | Output types are: 380 | - `img` (torch.Tensor) Shape: :math:`(3, H_img, W_img)` 381 | - `img_id` (int) 382 | - `size` (Tuple[int, int]) 383 | - `bboxes` (torch.Tensor) Shape: :math:`(4, num_priors)` 384 | - `labels` (torch.Tensor) Shape: :math:`(num_priors)` 385 | - `bboxes_mask` (torch.BoolTensor) Shape: :math:`(num_priors)` 386 | - `scene_id` (int) 387 | - `depth` (torch.Tensor) Shape: :math:`(1, H_featuremap, W_featuremap)` 388 | - `normals` (torch.Tensor) Shape: :math:`(3, H_featuremap, W_featuremap)` 389 | - `normals_mask` (torch.BoolTensor) Shape: :math:`(1, H_featuremap, W_featuremap)` 390 | - `auxiliary` (bool) 391 | 392 | Args: 393 | sample: dictionary containing the image, bounding boxes, labels, bounding boxes masks, 394 | depth map, normals and normals mask of a sample from the NYUv2 dataset 395 | 396 | Returns: 397 | input sample, with data augmentation and transformation applied 398 | 399 | """ 400 | s = sample 401 | 402 | if s['auxiliary']: 403 | auxiliary = {'flip': False, 'scaling_factor': 1} 404 | else: 405 | auxiliary = None 406 | 407 | # Train transform 408 | if self.train: 409 | # Crop and flip 410 | s = self.crop(s, auxiliary=auxiliary) 411 | s = self.hflip(s, auxiliary=auxiliary) 412 | # Apply training image transform 413 | s['img'] = self.img_train_trans(s['img']).contiguous() 414 | 415 | # Test transform 416 | else: 417 | # Apply test image transform 418 | s['img'] = self.img_test_trans(s['img']).contiguous() 419 | 420 | # Encode the bounding boxes 421 | s['bboxes'], s['labels'], s['bboxes_mask'] = self.encoder.encode(s['bboxes'], s['labels']) 422 | # permute to get bboxes in the correct format (4, num_priors) instead of (num_priors, 4) 423 | s['bboxes'] = s['bboxes'].permute(1, 0) 424 | 425 | # Transform the auxiliary tasks 426 | if s['auxiliary']: 427 | if s['depth']: 428 | s['depth'] = self.depth_trans(s['depth'], auxiliary) 429 | 430 | if s['normals'] and s['normals_mask']: 431 | s['normals'], s['normals_mask'] = self.normals_trans(s['normals'], s['normals_mask'], auxiliary) 432 | 433 | return s 434 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rock-pytorch 2 | 3 | 4 | A [PyTorch](https://pytorch.org/) implementation of [ROCK](http://papers.neurips.cc/paper/7406-revisiting-multi-task-learning-with-rock-a-deep-residual-auxiliary-block-for-visual-detection) from the NeurIPS 2018 paper by Taylor Mordan, Nicolas Thome, Gilles Henaff, and Matthieu Cord. 
5 | 6 | #### Abstract 7 | > Revisiting Multi-Task Learning with ROCK: a Deep Residual Auxiliary Block for Visual Detection 8 | > 9 | >Multi-Task Learning (MTL) is appealing for deep learning regularization. In this 10 | >paper, we tackle a specific MTL context denoted as primary MTL, where the ultimate goal is to 11 | >improve the performance of a given primary task by leveraging 12 | >several other auxiliary tasks. Our main methodological contribution is to introduce 13 | >ROCK, a new generic multi-modal fusion block for deep learning tailored to the 14 | >primary MTL context. ROCK architecture is based on a residual connection, which 15 | >makes forward prediction explicitly impacted by the intermediate auxiliary representations. 16 | >The auxiliary predictor’s architecture is also specifically designed to 17 | >our primary MTL context, by incorporating intensive pooling operators for 18 | >maximizing complementarity of intermediate representations. Extensive experiments 19 | >on NYUv2 dataset (object detection with scene classification, depth prediction, 20 | >and surface normal estimation as auxiliary tasks) validate the relevance of the 21 | >approach and its superiority to flat MTL approaches. Our method outperforms 22 | >state-of-the-art object detection models on NYUv2 dataset by a large margin, and 23 | >is also able to handle large-scale heterogeneous inputs (real and synthetic images) 24 | >with missing annotation modalities. 25 | 26 |  27 | 28 | 29 | ### Table of Contents 30 | - [Installation](#installation) 31 | - [Interfaces](#interfaces) 32 | - [Preparing the data](#preparing-the-data) 33 | - [Training](#training) 34 | - [Evaluation](#evaluation) 35 | - [Detection](#detection) 36 | - [Changes from paper](#changes-from-paper) 37 | - [Performance](#performance) 38 | - [Project structure](#project-structure) 39 | - [References](#references) 40 | - [Citation](#citation) 41 | 42 | 43 | ## Installation 44 | 45 | ``` 46 | # To clone the repository using HTTPS 47 | git clone https://github.com/vita-epfl/rock-pytorch.git 48 | cd rock-pytorch/ 49 | ``` 50 | 51 | This project has been tested with python==3.7 and pytorch==1.5.0. All required packages can be found in the `requirements.txt` file. 52 | 53 | Note : the pip and conda versions of `pycocotools` are out-of-date and incompatible with `numpy 1.18` or above. To install an up-to-date version of `pycocotools`, run: 54 | ``` 55 | pip install cython; pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI' 56 | ``` 57 | 58 | 59 | ### Datasets 60 | 61 | The [NYU Depth Dataset V2 dataset](https://cs.nyu.edu/~silberman/datasets/nyu_depth_v2.html) is used to train and evaluate this network. 62 | 63 | Download links: 64 | - Dataset containing the images, depths, labels, scenes and instances: http://horatio.cs.nyu.edu/mit/silberman/nyu_depth_v2/nyu_depth_v2_labeled.mat 65 | - Train / test split: http://horatio.cs.nyu.edu/mit/silberman/indoor_seg_sup/splits.mat 66 | - Surface normals and masks: https://cs.nyu.edu/~deigen/dnl/normals_gt.tgz. 
67 | - Train / val split: https://github.com/vita-epfl/rock-pytorch/releases/download/v0.1/val_split.txt 68 | 69 | 70 | #### Download the dataset from the command line: 71 | ``` 72 | mkdir data 73 | cd data/ 74 | 75 | wget http://horatio.cs.nyu.edu/mit/silberman/nyu_depth_v2/nyu_depth_v2_labeled.mat 76 | 77 | wget http://horatio.cs.nyu.edu/mit/silberman/indoor_seg_sup/splits.mat 78 | 79 | wget https://cs.nyu.edu/~deigen/dnl/normals_gt.tgz 80 | tar xzf normals_gt.tgz 81 | 82 | wget https://github.com/vita-epfl/rock-pytorch/releases/download/v0.1/val_split.txt 83 | 84 | cd .. 85 | ``` 86 | 87 | ## Interfaces 88 | 89 | All the commands can be run through a main file called `run.py` using subparsers. To check all the possible commands, run: 90 | - `python3 -m rock.run -h` 91 | - `python3 -m rock.run prep -h` 92 | - `python3 -m rock.run train -h` 93 | - `python3 -m rock.run eval -h` 94 | - `python3 -m rock.run create_image_folder -h` 95 | - `python3 -m rock.run detect -h` 96 | 97 | or check the file: `rock/run.py`. 98 | 99 | If the datasets are located as indicated in the [project structure](#project-structure), these commands can be run without optional arguments specifying the paths. 100 | 101 | To train / evaluate / detect images using a baseline SSD instead of an SSD with the ROCK block, add the `--no_rock` argument to the command. 102 | 103 | ## Preparing the data 104 | 105 | As the NYUv2 dataset does not contain object bounding boxes, some pre-processing is needed to add the bounding boxes, format the data in a suitable way for training, and create the training / testing (and optionally validation) sets. 106 | To do so, run: 107 | ``` 108 | python3 -m rock.run prep 109 | ``` 110 | with the appropriate optional arguments, if needed. 111 | 112 | Note: 113 | The validation set, if added using the `--val_split_path` argument, is extracted from the training set. 114 | It is therefore important to differentiate between the training set obtained without validation data (which contains the training set in its entirety) and the training set obtained with validation data (from which the validation images have been removed). It is recommended to save these two datasets under different paths. 115 | An example file structure for the datasets can be found in the [project structure section](#project-structure). 116 | 117 | 118 | ## Training 119 | To train the model, run: 120 | 121 | ``` 122 | python3 -m rock.run train 123 | ``` 124 | with the appropriate optional arguments, if needed. 125 | 126 | The optional arguments can be used to: 127 | - specify the dataset 128 | - change the hyper-parameters 129 | - change the model architecture 130 | - resume training from a checkpoint 131 | - specify the model save path and saving conditions 132 | - specify the evaluation frequency 133 | - visualize the training with [TensorBoard](https://www.tensorflow.org/tensorboard/) 134 | 135 | The default values of the hyperparameter arguments are the ones used to obtain the first two results described in the [performance](#performance) section (Baseline SSD and ROCK trained on the train set and evaluated on the test set). 136 | 137 | Here are some ways in which training can be changed using optional arguments: 138 | #### Model architecture 139 | ##### SSD with ROCK 140 | By default, training implements the ROCK block, which is trained with 3 auxiliary tasks: scene classification, depth prediction, and surface normal estimation. 141 | 142 | Any of these auxiliary tasks can be disabled during training using the `--aux_tasks` argument.
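For reference, the tasks passed to `--aux_tasks` correspond to the `aux_tasks` tuple consumed by `MultiTaskLoss` in `rock/model/losses.py`. The sketch below is illustrative only (the exact command-line format accepted by `--aux_tasks` is defined in `rock/run.py`); it builds the multi-task criterion with just the scene and depth tasks enabled:

```
# Illustrative sketch: build the multi-task loss with only two of the three
# auxiliary tasks, mirroring what the --aux_tasks training argument controls.
import rock.model.losses as losses

# Tasks not listed here ('normals' in this case) do not contribute to the loss.
criterion = losses.MultiTaskLoss(aux_tasks=('scene', 'depth'))

# criterion(sample) expects a dict with the matching *_pred / *_gt entries
# (here: scene_pred, scene_gt, depth_pred, depth_gt) and returns a scalar loss.
```

Each enabled task keeps its fixed weight from `MultiTaskLoss` (3 for scene, 3 for depth, 30 for surface normals), and the detection loss is always included in the total loss, whichever auxiliary tasks are selected.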
143 | 144 | ##### Baseline SSD 145 | As ROCK is implemented on top of a [Single Shot Detector](https://arxiv.org/abs/1512.02325), this repository can also be used for SSD implementations requiring non-square images. 146 | 147 | To disable the ROCK block altogether and obtain a baseline SSD, use the `--no_rock` argument. 148 | 149 | 150 | #### Resuming training from a checkpoint 151 | Use the `--checkpoint_path path/to/checkpoint` argument to resume training from a checkpoint. 152 | 153 | #### Saving the model 154 | 155 | Use the `--save_path path/to/folder` argument to specify the path in which the model weights will be saved. 156 | 157 | By default, the model weights are only saved once training is completed. 158 | Use the `--save_best_on_val` argument to save the model with the best mAP on the validation data, and the `--model_save_freq num_epochs` argument to save the model every `num_epochs` epochs. 159 | 160 | #### Visualization using TensorBoard 161 | 162 | If [TensorBoard](https://www.tensorflow.org/tensorboard/) is installed, metrics (such as all training and validation losses, and mAP), as well as training images, can be tracked and visualized by adding the optional argument `--writer_path path/to/folder`. 163 | 164 | Note: 165 | To launch TensorBoard, run: `tensorboard --logdir=path/to/logdir` 166 | 167 | ###### Detections during training using TensorBoard 168 |  169 | 170 | 171 | ## Evaluation 172 | To evaluate the model with [COCOeval](http://cocodataset.org/#home) and obtain the mean Average Precision (mAP), run: 173 | ``` 174 | python3 -m rock.run eval [model_path] 175 | ``` 176 | with the appropriate optional arguments if needed, and where `[model_path]` is the path to a trained model. Add the `--no_rock` argument if you are evaluating a baseline SSD (without the ROCK block). 177 | 178 | The `--show_all_cats` argument can be used to show the mAP per object category. 179 | 180 | For evaluation, ground-truth and network output JSON files are created in `data/eval` in a format similar to that of the COCO dataset; they are then used by COCOeval to calculate the mAP. 181 | 182 | ## Detection 183 | To run object detection on a trained model, a folder with images is required: 184 | - run `python3 -m rock.run create_image_folder`, with the appropriate optional arguments if needed, to create a folder containing only the images from a pre-processed NYUv2 dataset (created using `rock.run prep`). 185 | - or manually create your own image folder. These images can be of any size, as they are automatically cropped and resized before object detection. 186 | 187 | Once an image folder is created, run: 188 | ``` 189 | python3 -m rock.run detect [model_path] 190 | ``` 191 | with the appropriate optional arguments if needed, and where `[model_path]` is the path to a trained model. Add the `--no_rock` argument if you are running detection using a baseline SSD (without the ROCK block). 192 | 193 | The output images are saved in the specified folder (default: `data/detection/output`) with the same names as the input images. 194 | 195 | ###### Image detection with ROCK 196 |