├── .gitignore ├── CodeSLAM.ipynb ├── Makefile ├── README.md ├── U-Net ├── Unet.py ├── compute_errors.py └── image_utils.py ├── preprocessing.py ├── read_protobuf.py ├── requirements.txt └── scenenet.proto /.gitignore: -------------------------------------------------------------------------------- 1 | # datasets 2 | data/* 3 | 4 | # vscode changes 5 | .vscode 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | scenenet_pb2.py 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *,cover 53 | .hypothesis/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # IPython Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | venv/ 90 | ENV/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | 95 | # Rope project settings 96 | .ropeproject -------------------------------------------------------------------------------- /CodeSLAM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "CodeSLAM.ipynb", 7 | "provenance": [], 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "view-in-github", 20 | "colab_type": "text" 21 | }, 22 | "source": [ 23 | "\"Open" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": { 29 | "id": "8_azmJjSK6Fu", 30 | "colab_type": "text" 31 | }, 32 | "source": [ 33 | "#CodeSLAM\n", 34 | "\n", 35 | "## Abstract" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "gTygVZ5SIC4O", 42 | "colab_type": "code", 43 | "colab": {} 44 | }, 45 | "source": [ 46 | "from torch import vis\n", 47 | "import matplotlib as plt\n", 48 | "import torch" 49 | ], 50 | "execution_count": 0, 51 | "outputs": [] 52 | } 53 | ] 54 | } -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | protoc --python_out=./ scenenet.proto 3 | 4 | clean: 5 | $(RM) scenenet_pb2.py 6 | $(RM) -r __pycache__ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CodeSLAM 2 | 3 | PyTorch implementation of [CodeSLAM - Learning a Compact, Optimisable 
Representation for Dense Visual SLAM](https://arxiv.org/pdf/1804.00874.pdf).
4 |
5 | ## Summary
6 |
7 | ### Problems it tries to tackle/solve
8 | - Representation of geometry in real 3D perception systems.
9 | - Dense representations, possibly augmented with semantic labels, are high-dimensional and unsuitable for probabilistic inference.
10 | - Sparse representations avoid these problems but capture only partial scene information.
11 |
12 | ### The new approach/solution
13 | - A new compact but dense representation of scene geometry, conditioned on the intensity data from a single image and generated from a code consisting of a small number of parameters.
14 | - Each keyframe can produce a depth map, but the code can be optimised jointly with pose variables and with the codes of overlapping keyframes for global consistency.
15 |
16 | ### Introduction
17 | - As uncertainty propagation quickly becomes intractable for a large number of degrees of freedom, SLAM approaches split into two categories:
18 |   - sparse SLAM, which represents geometry by a sparse set of features;
19 |   - dense SLAM, which attempts to retrieve a more complete description of the environment.
20 | - The geometry of natural scenes exhibits a high degree of order, so we may not need a large number of parameters to represent it.
21 | - Besides that, a scene could be decomposed into a set of semantic objects (e.g. a chair) together with some internal parameters (e.g. size of the chair, number of legs) and a pose. Other, more general scene elements that exhibit simple regularity can be recognised and parametrised within SLAM systems.
22 | - A straightforward autoencoder might oversimplify the reconstruction of natural scenes; the **novelty** is to condition the training on intensity images.
23 | - A **scene map** consists of a set of selected and estimated historical camera poses together with the corresponding captured images and supplementary local information such as depth estimates. The intensity images are usually required for additional tasks.
24 | - The **depth map estimate** becomes a function of the corresponding intensity image and an unknown compact representation (referred to as the **code**).
25 | - We can think of the image as providing local details and the code as supplying more global shape parameters; this can be seen as a step towards enabling optimisation in a general semantic space.
26 | - The **two key contributions** of this paper are:
27 |   - The derivation of a compact and optimisable representation of dense geometry by conditioning a depth autoencoder on intensity images (a minimal sketch of this idea follows this list).
28 |   - The implementation of the first real-time targeted monocular system that achieves such a tight joint optimisation of motion and dense geometry.
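To make the conditioning idea concrete, below is a minimal, hypothetical PyTorch sketch. It is **not** the network used in this repository (see `U-Net/Unet.py`) nor the paper's exact variational architecture; the class names, layer sizes and code dimension are illustrative assumptions only. A depth encoder compresses a depth map into a small code, and a decoder conditioned on the intensity image turns that code back into a depth map:

```python
import torch
import torch.nn as nn

class DepthEncoder(nn.Module):
    """Compress a depth map into a small latent code (illustrative only)."""
    def __init__(self, code_size=32):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=4, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(),
            nn.Linear(32, code_size),
        )

    def forward(self, depth):
        return self.net(depth)

class ConditionedDepthDecoder(nn.Module):
    """Predict a depth map from intensity-image features fused with the code."""
    def __init__(self, code_size=32):
        super().__init__()
        self.image_features = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1), nn.ReLU(),
            nn.Conv2d(16, 16, kernel_size=3, padding=1), nn.ReLU(),
        )
        self.code_projection = nn.Linear(code_size, 16)
        self.head = nn.Conv2d(16, 1, kernel_size=3, padding=1)

    def forward(self, intensity, code):
        feats = self.image_features(intensity)
        # Broadcast the code over the spatial dimensions and fuse it with the
        # image features: the image supplies local detail, the code global shape.
        c = self.code_projection(code).unsqueeze(-1).unsqueeze(-1)
        return self.head(torch.relu(feats + c))

if __name__ == '__main__':
    depth = torch.rand(2, 1, 192, 256)      # normalized depth maps
    intensity = torch.rand(2, 1, 192, 256)  # grayscale intensity images
    code = DepthEncoder()(depth)            # compact per-keyframe code
    pred = ConditionedDepthDecoder()(intensity, code)
    print(code.shape, pred.shape)           # (2, 32) and (2, 1, 192, 256)
```

At SLAM time the encoder would be dropped and the code treated as an optimisable variable (e.g. a `torch.nn.Parameter`) refined jointly with camera poses and the codes of overlapping keyframes, which is the property the contributions above rely on.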
29 | 30 | ## Usage 31 | - generate the python module for the protobuf: `protoc --python_out=./ scenenet.proto` 32 | 33 | ## Results 34 | 35 | ## Requirements 36 | - Python 3.4+ 37 | - PyTorch 1.0+ 38 | - Torchvision 0.4.0+ -------------------------------------------------------------------------------- /U-Net/Unet.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import os 4 | 5 | def conv3x3(in_planes, out_planes, stride=1): 6 | """3x3 conv layer with padding.""" 7 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) 8 | 9 | class Bottleneck(nn.Module): 10 | expansion = 4 11 | 12 | def __init__(self, inplanes, planes, stride=1, downsample=None): 13 | super(Bottleneck, self).__init__() 14 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 15 | self.bn1 = nn.BatchNorm2d(planes) 16 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 17 | self.bn2 = nn.BatchNorm2d(planes) 18 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 19 | self.bn3 = nn.BatchNorm2d(planes * 4) 20 | self.relu = nn.ReLU(inplace=True) 21 | self.downsample = downsample 22 | self.stride = stride 23 | 24 | def forward(self, x): 25 | residual = x 26 | 27 | out = self.conv1(x) 28 | out = self.bn1(out) 29 | out = self.relu(out) 30 | 31 | out = self.conv2(out) 32 | out = self.bn2(out) 33 | out = self.relu(out) 34 | 35 | out = self.conv3(out) 36 | out = self.bn3(out) 37 | 38 | if self.downsample is not None: 39 | residual = self.downsample(x) 40 | 41 | out += residual 42 | out = self.relu(out) 43 | 44 | return out 45 | 46 | def get_incoming_shape(incoming): 47 | size = incoming.size() 48 | # returns the incoming data shape as a list 49 | return [size[0], size[1], size[2], size[3]] 50 | 51 | def interleave(tensors, axis): 52 | # change the first element (batch_size to -1) 53 | old_shape = get_incoming_shape(tensors[0])[1:] 54 | new_shape = [-1] + old_shape 55 | 56 | # double 1 dimension 57 | new_shape[axis] *= len(tensors) 58 | 59 | # pack the tensors on top of each other 60 | stacked = torch.stack(tensors, axis+1) 61 | 62 | # reshape and return 63 | reshaped = stacked.view(new_shape) 64 | return reshaped 65 | 66 | class UnpoolingAsConvolution(nn.Module): 67 | def __init__(self, inplanes, planes): 68 | super(UnpoolingAsConvolution, self).__init__() 69 | 70 | # interleaving convolutions 71 | self.conv_A = nn.Conv2d(in_channels=inplanes, out_channels=planes, kernel_size=(3, 3), stride=1, padding=1) 72 | self.conv_B = nn.Conv2d(in_channels=inplanes, out_channels=planes, kernel_size=(2, 3), stride=1, padding=0) 73 | self.conv_C = nn.Conv2d(in_channels=inplanes, out_channels=planes, kernel_size=(3, 2), stride=1, padding=0) 74 | self.conv_D = nn.Conv2d(in_channels=inplanes, out_channels=planes, kernel_size=(2, 2), stride=1, padding=0) 75 | 76 | def forward(self, x): 77 | output_a = self.conv_A(x) 78 | 79 | padded_b = nn.functional.pad(x, (1, 1, 0, 1)) 80 | output_b = self.conv_B(padded_b) 81 | 82 | padded_c = nn.functional.pad(x, (0, 1, 1, 1)) 83 | output_c = self.conv_C(padded_c) 84 | 85 | padded_d = nn.functional.pad(x, (0, 1, 0, 1)) 86 | output_d = self.conv_D(padded_d) 87 | 88 | left = interleave([output_a, output_b], axis=2) 89 | right = interleave([output_c, output_d], axis=2) 90 | y = interleave([left, right], axis=3) 91 | return y 92 | 93 | class UpProjection(nn.Module): 94 | def __init__(self, inplanes, planes): 95 | 
super(UpProjection, self).__init__() 96 | 97 | self.unpool_main = UnpoolingAsConvolution(inplanes, planes) 98 | self.unpool_res = UnpoolingAsConvolution(inplanes, planes) 99 | 100 | self.main_branch = nn.Sequential( 101 | self.unpool_main, 102 | nn.BatchNorm2d(planes), 103 | nn.ReLU(inplace=False), 104 | nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1), 105 | nn.BatchNorm2d(planes) 106 | ) 107 | 108 | self.residual_branch = nn.Sequential( 109 | self.unpool_res, 110 | nn.BatchNorm2d(planes), 111 | ) 112 | 113 | self.relu = nn.ReLU(inplace=False) 114 | 115 | def forward(self, input_data): 116 | x = self.main_branch(input_data) 117 | res = self.residual_branch(input_data) 118 | x += res 119 | x = self.relu(x) 120 | return x 121 | 122 | class ConConv(nn.Module): 123 | def __init__(self, inplanes_x1, inplanes_x2, planes): 124 | super(ConConv, self).__init__() 125 | self.conv = nn.Conv2d(inplanes_x1 + inplanes_x2, planes, kernel_size=1, bias=True) 126 | 127 | def forward(self, x1, x2): 128 | x1 = torch.cat([x2, x1], dim=1) 129 | x1 = self.conv(x1) 130 | return x1 131 | 132 | class ResnetUnetHybrid(nn.Module): 133 | def __init__(self, block, layers): 134 | self.inplanes = 64 135 | 136 | # resnet layers 137 | super(ResnetUnetHybrid, self).__init__() 138 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 139 | self.bn1 = nn.BatchNorm2d(64) 140 | self.relu = nn.ReLU(inplace=True) 141 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 142 | 143 | self.layer1 = self._make_layer(block, 64, layers[0]) 144 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 145 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 146 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 147 | 148 | # additional up projection layers parts 149 | self.conv2 = nn.Conv2d(2048, 1024, 1, bias=True) 150 | self.bn2 = nn.BatchNorm2d(1024) 151 | 152 | self.up_proj1 = UpProjection(1024, 512) 153 | self.up_proj2 = UpProjection(512, 256) 154 | self.up_proj3 = UpProjection(256, 128) 155 | self.up_proj4 = UpProjection(128, 64) 156 | 157 | self.drop = nn.Dropout(0.5, False) 158 | self.conv3 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1, bias=True) 159 | 160 | # padding + concat for unet stuff 161 | self.con_conv1 = ConConv(1024, 512, 512) 162 | self.con_conv2 = ConConv(512, 256, 256) 163 | self.con_conv3 = ConConv(256, 128, 128) 164 | self.con_conv4 = ConConv(64, 64, 64) 165 | 166 | for m in self.modules(): 167 | if isinstance(m, nn.Conv2d): 168 | nn.init.normal_(m.weight, 0, 0.01) 169 | 170 | elif isinstance(m, nn.BatchNorm2d): 171 | nn.init.constant_(m.weight, 1) 172 | nn.init.constant_(m.bias, 0) 173 | 174 | def _make_layer(self, block, planes, blocks, stride=1): 175 | downsample = None 176 | if stride != 1 or self.inplanes != planes * block.expansion: 177 | downsample = nn.Sequential( 178 | nn.Conv2d(self.inplanes, planes * block.expansion, 179 | kernel_size=1, stride=stride, bias=False), 180 | nn.BatchNorm2d(planes * block.expansion), 181 | ) 182 | 183 | layers = list() 184 | layers.append(block(self.inplanes, planes, stride, downsample)) 185 | self.inplanes = planes * block.expansion 186 | for i in range(1, blocks): 187 | layers.append(block(self.inplanes, planes)) 188 | 189 | return nn.Sequential(*layers) 190 | 191 | def forward(self, x): 192 | x = self.conv1(x) 193 | x = self.bn1(x) 194 | x_to_conv4 = self.relu(x) 195 | 196 | x = self.maxpool(x_to_conv4) 197 | x_to_conv3 = self.layer1(x) 198 | x_to_conv2 = 
self.layer2(x_to_conv3)
199 |         x_to_conv1 = self.layer3(x_to_conv2)
200 |         x = self.layer4(x_to_conv1)
201 |
202 |         # additional layers
203 |         x = self.conv2(x)
204 |         x = self.bn2(x)
205 |
206 |         # up project part
207 |         x = self.up_proj1(x)
208 |         x = self.con_conv1(x, x_to_conv1)
209 |
210 |         x = self.up_proj2(x)
211 |         x = self.con_conv2(x, x_to_conv2)
212 |
213 |         x = self.up_proj3(x)
214 |         x = self.con_conv3(x, x_to_conv3)
215 |
216 |         x = self.up_proj4(x)
217 |         x = self.con_conv4(x, x_to_conv4)
218 |
219 |         x = self.drop(x)
220 |         x = self.conv3(x)
221 |         x = self.relu(x)
222 |
223 |         return x
224 |
225 |     @classmethod
226 |     def load_pretrained(cls, device, load_path='hyb_net_weights.model'):
227 |         model = cls(Bottleneck, [3, 4, 6, 3])
228 |
229 |         # download the weights if they are not present
230 |         if not os.path.exists(load_path):
231 |             print('Downloading model weights...')
232 |             os.system('wget https://www.dropbox.com/s/amad4ko9opi4kts/hyb_net_weights.model')
233 |
234 |         model = model.to(device)
235 |         model.load_state_dict(torch.load(load_path, map_location=device))
236 |
237 |         return model
--------------------------------------------------------------------------------
/U-Net/compute_errors.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import sys
4 | import cv2
5 | import numpy as np
6 | import torch
7 | import torch.nn.functional as F
8 | from Unet import ResnetUnetHybrid
9 | import image_utils
10 |
11 | subfolders = range(0, 1000)
12 | test_dir = '../data/val'
13 |
14 | def show_test_files():
15 |     # build test files list
16 |     test_paths = [os.path.join(test_dir, str(f)) for f in subfolders]
17 |     test_img_paths = []
18 |     for img_path in test_paths:
19 |         current_image_path = os.path.join(img_path, 'photo')
20 |         for filename in glob.iglob(current_image_path + '/*', recursive=True):
21 |             test_img_paths.append(filename)
22 |
23 |     test_img_paths.sort()
24 |     # build labels list
25 |     test_label_paths = []
26 |     for img_path in test_paths:
27 |         current_image_path = os.path.join(img_path, 'depth')
28 |         for filename in glob.iglob(current_image_path + '/*', recursive=True):
29 |             test_label_paths.append(filename)
30 |
31 |     test_label_paths.sort()
32 |
33 |     return test_img_paths, test_label_paths
34 |
35 | if __name__ == '__main__':
36 |     # The error metrics themselves are not implemented yet; for now just
37 |     # build the test file lists and sanity-check that they line up.
38 |     test_img_paths, test_label_paths = show_test_files()
39 |     print('Found {0} test images and {1} depth labels'.format(len(test_img_paths), len(test_label_paths)))
--------------------------------------------------------------------------------
/U-Net/image_utils.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | import math
4 | import matplotlib.pyplot as plt
5 | from torchvision import transforms
6 |
7 | HEIGHT = 240
8 | WIDTH = 320
9 |
10 | def show_img_and_pred(img, depth):
11 |     """Plot an image and a corresponding prediction next to each other."""
12 |     plt.figure()
13 |     plt.subplot(1, 2, 1)
14 |     plt.imshow(img)
15 |     plt.subplot(1, 2, 2)
16 |     pred = np.transpose(depth, (1, 2, 0))
17 |     plt.imshow(pred[:, :, 0])
18 |     plt.show()
19 |
20 | def scale_image(img, scale=None):
21 |     """Resize/scale an image.
    If no scale is given, the image is scaled so that both dimensions reach at least WIDTH x HEIGHT."""
22 |     # if scale is None, choose the factor that brings both dimensions up to at least WIDTH x HEIGHT
23 |     if scale is None:
24 |         scale = max(WIDTH / img.shape[1], HEIGHT / img.shape[0])
25 |
26 |     new_size = (math.ceil(img.shape[1] * scale), math.ceil(img.shape[0] * scale))
27 |     image = cv2.resize(img, new_size, interpolation=cv2.INTER_NEAREST)
28 |     return image
29 |
30 | def img_transform(img):
31 |     """Normalize an image."""
32 |     data_transform = transforms.Compose([
33 |         transforms.ToTensor(),
34 |         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
35 |     ])
36 |     img = data_transform(img)
37 |     return img
38 |
39 | def depth_to_grayscale(depth, max_dist=10.0):
40 |     """Transform a prediction into a grayscale 8-bit image."""
41 |     depth = np.transpose(depth, (1, 2, 0))
42 |     depth[depth > max_dist] = max_dist
43 |     depth = depth / max_dist
44 |
45 |     depth = np.array(depth * 255.0, dtype=np.uint8)
46 |     depth = cv2.resize(depth, (WIDTH, HEIGHT))
47 |
48 |     bgr_depth_img = cv2.cvtColor(depth, cv2.COLOR_GRAY2BGR)
49 |     bgr_depth_img = np.clip(bgr_depth_img, 0, 255)
50 |     return bgr_depth_img
--------------------------------------------------------------------------------
/preprocessing.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import multiprocessing as mp
3 | import numpy as np
4 | import os
5 | from PIL import Image
6 | import statistics
7 | import sys
8 |
9 |
10 | TRAINING_SET_PATH = 'data/train/'
11 | DATASET_SUBFOLDERS = range(0, 15)
12 | IMAGE_NEW_WIDTH = 256
13 | IMAGE_NEW_HEIGHT = 192
14 |
15 | def create_intensity_images_from_rgb_images_folder(path, subfolder):
16 |     print("create intensity is called")
17 |     for filename in glob.iglob(path + str(subfolder) + '/*/photo/*', recursive=True):
18 |         # don't duplicate intensity images
19 |         if not filename.endswith("_intensity.jpg"):
20 |             img = Image.open(filename).convert('L')
21 |             image_name_without_extension = filename.split('.')[0]
22 |             intensity_image_name = image_name_without_extension + '_intensity.jpg'
23 |             print(intensity_image_name)
24 |             img.save(intensity_image_name)
25 |
26 | def resize_intensity_images(path, new_width, new_height, subfolder):
27 |     for filename in glob.iglob(path + str(subfolder) + '/*/photo/*_intensity.jpg', recursive=True):
28 |         print(filename)
29 |         img = Image.open(filename)
30 |         resized_image = img.resize((new_width, new_height))
31 |         image_name_without_extension = filename.split('.')[0]
32 |         resized_intensity_image_name = image_name_without_extension + '_resized.jpg'
33 |         resized_image.save(resized_intensity_image_name, "JPEG", optimize=True)
34 |         # remove the original intensity image
35 |         os.remove(filename)
36 |
37 | def scale_depth(image, average):
38 |     for i in range(len(image)):
39 |         image[i] = average / (average + image[i])  # proximity parameterisation: d -> a / (a + d)
40 |
41 | def normalize_depth_values(path, subfolder):
42 |     for filename in glob.iglob(path + str(subfolder) + '/*/depth/*[0-9].png', recursive=True):
43 |         print(filename)
44 |         img = Image.open(filename)
45 |         size = img.size
46 |         image_values = list(img.getdata())
47 |         average_depth = statistics.mean(image_values)
48 |         if average_depth == 0:
49 |             average_depth = 0.000001
50 |         scale_depth(image_values, average_depth)
51 |         image_array = np.array(image_values, dtype=np.float32) * 255.0  # map proximities in (0, 1] to the 8-bit range
52 |         image = Image.new("L", size)
53 |         image.putdata(image_array)
54 |         normalized_image_name = filename.split('.')[0] + '_normalized.png'
55 |         image.save(normalized_image_name, "PNG", optimize=True)
56 |
57 | def remove_normalized_depth_images(path):
58 |     for filename in glob.iglob(path + '**/*/depth/*_normalized.png', recursive=True):
59 |         os.remove(filename)
60 |
61 | def remove_intensity_images(path):
62 |     for filename in glob.iglob(path + '**/*/photo/*_intensity.jpg', recursive=True):
63 |         os.remove(filename)
64 |     for filename in glob.iglob(path + '**/*/photo/*_resized.jpg', recursive=True):
65 |         os.remove(filename)
66 |
67 | if __name__ == '__main__':
68 |     pool = mp.Pool(mp.cpu_count())
69 |     # if no args are passed, don't alter images
70 |     if len(sys.argv) == 1:
71 |         print("You should specify converting to intensity (i), resizing (r) and/or depth normalization (n).")
72 |     elif len(sys.argv) == 2:
73 |         if str(sys.argv[1]) == "i":
74 |             [pool.apply(create_intensity_images_from_rgb_images_folder, args=(TRAINING_SET_PATH, subfolder)) for subfolder in DATASET_SUBFOLDERS]
75 |         elif str(sys.argv[1]) == "r":
76 |             [pool.apply(resize_intensity_images, args=(TRAINING_SET_PATH, IMAGE_NEW_WIDTH, IMAGE_NEW_HEIGHT, subfolder)) for subfolder in DATASET_SUBFOLDERS]
77 |         elif str(sys.argv[1]) == "n":
78 |             [pool.apply(normalize_depth_values, args=(TRAINING_SET_PATH, subfolder)) for subfolder in DATASET_SUBFOLDERS]
79 |         else:
80 |             print("Invalid argument: use 'i' to convert images to intensity images, 'r' to resize the intensity images, 'n' for depth normalization or any combination of them")
81 |     elif len(sys.argv) == 3:
82 |         if (str(sys.argv[1]) == "i" and str(sys.argv[2]) == "r") or (str(sys.argv[2]) == "i" and str(sys.argv[1]) == "r"):
83 |             [pool.apply(create_intensity_images_from_rgb_images_folder, args=(TRAINING_SET_PATH, subfolder)) for subfolder in DATASET_SUBFOLDERS]
84 |             [pool.apply(resize_intensity_images, args=(TRAINING_SET_PATH, IMAGE_NEW_WIDTH, IMAGE_NEW_HEIGHT, subfolder)) for subfolder in DATASET_SUBFOLDERS]
85 |         elif (str(sys.argv[1]) == "i" and str(sys.argv[2]) == "n") or (str(sys.argv[1]) == "n" and str(sys.argv[2]) == "i"):
86 |             [pool.apply(create_intensity_images_from_rgb_images_folder, args=(TRAINING_SET_PATH, subfolder)) for subfolder in DATASET_SUBFOLDERS]
87 |             [pool.apply(normalize_depth_values, args=(TRAINING_SET_PATH, subfolder)) for subfolder in DATASET_SUBFOLDERS]
88 |         elif (str(sys.argv[1]) == "r" and str(sys.argv[2]) == "n") or (str(sys.argv[1]) == "n" and str(sys.argv[2]) == "r"):
89 |             [pool.apply(resize_intensity_images, args=(TRAINING_SET_PATH, IMAGE_NEW_WIDTH, IMAGE_NEW_HEIGHT, subfolder)) for subfolder in DATASET_SUBFOLDERS]
90 |             [pool.apply(normalize_depth_values, args=(TRAINING_SET_PATH, subfolder)) for subfolder in DATASET_SUBFOLDERS]
91 |         else:
92 |             print("Invalid arguments: use 'i' to convert images to intensity images, 'r' to resize the intensity images, 'n' for depth normalization or any combination of them")
93 |     elif len(sys.argv) == 4:
94 |         if ((str(sys.argv[1]) == "i" and str(sys.argv[2]) == "r" and str(sys.argv[3]) == "n") or
95 |             (str(sys.argv[1]) == "i" and str(sys.argv[2]) == "n" and str(sys.argv[3]) == "r") or
96 |             (str(sys.argv[1]) == "r" and str(sys.argv[2]) == "i" and str(sys.argv[3]) == "n") or
97 |             (str(sys.argv[1]) == "r" and str(sys.argv[2]) == "n" and str(sys.argv[3]) == "i") or
98 |             (str(sys.argv[1]) == "n" and str(sys.argv[2]) == "i" and str(sys.argv[3]) == "r") or
99 |             (str(sys.argv[1]) == "n" and str(sys.argv[2]) == "r" and str(sys.argv[3]) == "i")):
100 |
101 |             [pool.apply(create_intensity_images_from_rgb_images_folder, args=(TRAINING_SET_PATH, subfolder)) for subfolder in DATASET_SUBFOLDERS]
102 |             [pool.apply(resize_intensity_images, args=(TRAINING_SET_PATH, IMAGE_NEW_WIDTH,
IMAGE_NEW_HEIGHT, subfolder)) for subfolder in DATASET_SUBFOLDERS] 103 | [pool.apply(normalize_depth_values, args=(TRAINING_SET_PATH, subfolder)) for subfolder in DATASET_SUBFOLDERS] 104 | else: 105 | print("Too many arguments: use 'i' for convert images to intensity images, 'r' to resize the intensity images, 'n' for depth normalization or any combination of them") 106 | 107 | pool.close() 108 | 109 | -------------------------------------------------------------------------------- /read_protobuf.py: -------------------------------------------------------------------------------- 1 | import scenenet_pb2 as sn 2 | import os 3 | 4 | DATA_ROOT_PATH = 'data/train_0/train/0' 5 | PROTOBUF_PATH = 'data/train_protobufs/scenenet_rgbd_train_0.pb' 6 | 7 | # These functions produce a file path (on Linux systems) to the image given 8 | # a view and render path from a trajectory. As long the DATA_ROOT_PATH to the 9 | # root of the dataset is given. I.e. to either val or train 10 | def photo_path_from_view(render_path,view): 11 | photo_path = os.path.join(render_path,'photo') 12 | image_path = os.path.join(photo_path,'{0}.jpg'.format(view.frame_num)) 13 | return os.path.join(DATA_ROOT_PATH,image_path) 14 | 15 | def instance_path_from_view(render_path,view): 16 | photo_path = os.path.join(render_path,'instance') 17 | image_path = os.path.join(photo_path,'{0}.png'.format(view.frame_num)) 18 | return os.path.join(DATA_ROOT_PATH,image_path) 19 | 20 | def depth_path_from_view(render_path,view): 21 | photo_path = os.path.join(render_path,'depth') 22 | image_path = os.path.join(photo_path,'{0}.png'.format(view.frame_num)) 23 | return os.path.join(DATA_ROOT_PATH,image_path) 24 | 25 | 26 | if __name__ == '__main__': 27 | trajectories = sn.Trajectories() 28 | try: 29 | with open(PROTOBUF_PATH,'rb') as f: 30 | trajectories.ParseFromString(f.read()) 31 | except IOError: 32 | print('Scenenet protobuf data not found at location:{0}'.format(DATA_ROOT_PATH)) 33 | print('Please ensure you have copied the pb file to the data directory') 34 | 35 | print('Number of trajectories:{0}'.format(len(trajectories.trajectories))) 36 | for traj in trajectories.trajectories: 37 | layout_type = sn.SceneLayout.LayoutType.Name(traj.layout.layout_type) 38 | layout_path = traj.layout.model 39 | print('='*20) 40 | print('Render path:{0}'.format(traj.render_path)) 41 | print('Layout type:{0} path:{1}'.format(layout_type,layout_path)) 42 | print('='*20) 43 | print('') 44 | print('Number of instances: {0}'.format(len(traj.instances))) 45 | ''' 46 | The instances attribute of trajectories contains all of the information 47 | about the different instances. The instance.instance_id attribute provides 48 | correspondences with the rendered instance.png files. I.e. for a given 49 | trajectory, if a pixel is of value 1, the information about that instance, 50 | such as its type, semantic class, and wordnet id, is stored here. 51 | For more information about the exact information available refer to the 52 | scenenet.proto file. 
53 | ''' 54 | for instance in traj.instances: 55 | instance_type = sn.Instance.InstanceType.Name(instance.instance_type) 56 | print('='*20) 57 | print('Instance id:{0}'.format(instance.instance_id)) 58 | print('Instance type:{0}'.format(instance_type)) 59 | if instance.instance_type != sn.Instance.BACKGROUND: 60 | print('Wordnet id:{0}'.format(instance.semantic_wordnet_id)) 61 | print('Plain english name:{0}'.format(instance.semantic_english)) 62 | if instance.instance_type == sn.Instance.LIGHT_OBJECT: 63 | light_type = sn.LightInfo.LightType.Name(instance.light_info.light_type) 64 | print('Light type:{0}'.format(light_type)) 65 | if instance.instance_type == sn.Instance.RANDOM_OBJECT: 66 | print('Object info:{0}'.format(instance.object_info)) 67 | print('-'*20) 68 | print('') 69 | print('Render path:{0}'.format(traj.render_path)) 70 | ''' 71 | The views attribute of trajectories contains all of the information 72 | about the rendered frames of a scene. This includes camera poses, 73 | frame numbers and timestamps. 74 | ''' 75 | for view in traj.views: 76 | print(photo_path_from_view(traj.render_path,view)) 77 | print(depth_path_from_view(traj.render_path,view)) 78 | print(instance_path_from_view(traj.render_path,view)) 79 | print(view) 80 | break -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.0 2 | Pillow==9.0.1 3 | protobuf==3.15.0 4 | six==1.14.0 5 | torch==1.4.0+cpu 6 | torchvision==0.5.0+cpu 7 | -------------------------------------------------------------------------------- /scenenet.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | package scenenet; 4 | 5 | message SceneLayout { 6 | enum LayoutType { 7 | BATHROOM = 1; 8 | BEDROOM = 2; 9 | KITCHEN = 3; 10 | LIVING_ROOM = 4; 11 | OFFICE = 5; 12 | } 13 | optional LayoutType layout_type = 1; 14 | // This is the name of the SceneNet model used for the layout 15 | optional string model = 2; 16 | } 17 | 18 | message LightInfo { 19 | enum LightType { 20 | SPHERE = 1; 21 | PARALLELOGRAM = 2; 22 | } 23 | optional LightType light_type = 1; 24 | // Light intensity 25 | optional Power light_output = 2; 26 | // This is the center for sphere type lights. 
And the corner for others.
27 |   optional Position position = 3;
28 |   // This is only for SPHERE lights
29 |   optional float radius = 4;
30 |   // This is only for PARALLELOGRAM lights
31 |   optional Position v1 = 5;
32 |   optional Position v2 = 6;
33 | }
34 |
35 | message RandomObjectInfo {
36 |   optional string shapenet_hash = 1;
37 |   optional float height_meters = 2;
38 |   message Transformation {
39 |     // The 3x4 matrix is as follows:
40 |     // rotation_mat11 rotation_mat12 rotation_mat13 translation_x
41 |     // rotation_mat21 rotation_mat22 rotation_mat23 translation_y
42 |     // rotation_mat31 rotation_mat32 rotation_mat33 translation_z
43 |     optional float translation_x = 1;
44 |     optional float translation_y = 2;
45 |     optional float translation_z = 3;
46 |     optional float rotation_mat11 = 4;
47 |     optional float rotation_mat12 = 5;
48 |     optional float rotation_mat13 = 6;
49 |     optional float rotation_mat21 = 7;
50 |     optional float rotation_mat22 = 8;
51 |     optional float rotation_mat23 = 9;
52 |     optional float rotation_mat31 = 10;
53 |     optional float rotation_mat32 = 11;
54 |     optional float rotation_mat33 = 12;
55 |   }
56 |   // This gives the transformation applied to an object, about the center of
57 |   // the base plane of its axis-aligned bounding box.
58 |   optional Transformation object_pose = 3;
59 | }
60 |
61 | message Instance {
62 |   optional int32 instance_id = 1;
63 |   optional string semantic_wordnet_id = 2;
64 |   optional string semantic_english = 3;
65 |   enum InstanceType {
66 |     // This is the instance type when no object is present, e.g. because of
67 |     // looking out a window into nothingness
68 |     BACKGROUND = 1;
69 |     // This is an object that is hard coded into the layout and does not
70 |     // move. This type does not have a transformation or shapenet hash
71 |     LAYOUT_OBJECT = 2;
72 |     // This is a randomly positioned light source
73 |     LIGHT_OBJECT = 3;
74 |     // This means the object is a randomly positioned shapenet object. The
75 |     // object has a transformation and scale parameter in the object_info
76 |     // variable.
77 |     RANDOM_OBJECT = 4;
78 |   }
79 |   optional InstanceType instance_type = 4;
80 |   // This information is only filled in for the respective type
81 |   optional LightInfo light_info = 5;
82 |   optional RandomObjectInfo object_info = 6;
83 | }
84 |
85 | message Power {
86 |   optional float r = 1;
87 |   optional float g = 2;
88 |   optional float b = 3;
89 | }
90 |
91 | message Position {
92 |   optional float x = 1;
93 |   optional float y = 2;
94 |   optional float z = 3;
95 | }
96 |
97 | message Pose {
98 |   // The positions of these two points define the camera view. The y vector is
99 |   // defined as [0,1,0]. For an example of how to calculate the camera view
100 |   // coordinate system, see the python codebase.
101 |   optional Position camera = 1;
102 |   optional Position lookat = 2;
103 |   optional float timestamp = 3;
104 | }
105 |
106 | message View {
107 |   // These increment by the number of skip frames, i.e. 0,25,50...7475.
108 | optional int32 frame_num = 1; 109 | // The photo is rendered by integrating uniformly sampled 110 | // exposures between the following two poses 111 | optional Pose shutter_open = 2; 112 | optional Pose shutter_close = 3; 113 | } 114 | 115 | message Trajectory { 116 | optional SceneLayout layout = 1; 117 | // The first instances[0] is always the 'background' and 118 | // undefined class when for example looking out windows 119 | repeated Instance instances = 2; 120 | // These are ordered sequentially for a trajectory 121 | repeated View views = 3; 122 | // This stores the path from the root data directory to the trajectory data 123 | // folder. If the trajectories are stored as: 124 | // /path/i/extracted/{val/train}/0/123/photo/0.jpg 125 | // then this path will be '0/123' designating the trajectories folder 126 | optional string render_path = 4; 127 | } 128 | 129 | message Trajectories { 130 | // This is the root list which stores all of the available trajectories 131 | repeated Trajectory trajectories = 1; 132 | } --------------------------------------------------------------------------------
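The `Pose` message above points to the Python codebase for how to build the camera view coordinate system from `camera`, `lookat` and the fixed up vector `[0, 1, 0]`. Below is a rough, hedged sketch of one such construction (a standard right-handed look-at; the exact axis conventions of the official SceneNet RGB-D utilities may differ), using a `Pose` parsed via the generated `scenenet_pb2` module:

```python
import numpy as np

def camera_to_world(pose):
    """Build a 4x4 camera-to-world matrix from a scenenet_pb2 Pose message."""
    cam = np.array([pose.camera.x, pose.camera.y, pose.camera.z])
    lookat = np.array([pose.lookat.x, pose.lookat.y, pose.lookat.z])
    world_up = np.array([0.0, 1.0, 0.0])   # as stated in the Pose comment
    forward = lookat - cam
    forward /= np.linalg.norm(forward)     # viewing direction
    right = np.cross(forward, world_up)
    right /= np.linalg.norm(right)         # camera x axis
    up = np.cross(right, forward)          # camera y axis (orthogonal by construction)
    T = np.eye(4)
    T[:3, 0], T[:3, 1], T[:3, 2] = right, up, forward
    T[:3, 3] = cam
    return T

# Example (after generating scenenet_pb2 with `make` / protoc):
# import scenenet_pb2 as sn
# trajectories = sn.Trajectories()
# with open('data/train_protobufs/scenenet_rgbd_train_0.pb', 'rb') as f:
#     trajectories.ParseFromString(f.read())
# print(camera_to_world(trajectories.trajectories[0].views[0].shutter_open))
```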