├── .gitignore
├── LICENSE
├── architecture.png
├── configs
│   ├── datasets
│   │   ├── argoverse.yml
│   │   └── nuscenes.yml
│   ├── defaults.yml
│   ├── experiments
│   │   └── test.yml
│   └── models
│       ├── pyramid.yml
│       ├── ved.yml
│       └── vpn.yml
├── readme.md
├── scripts
│   ├── make_argoverse_labels.py
│   └── make_nuscenes_labels.py
├── src
│   ├── __init__.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── argoverse
│   │   │   ├── __init__.py
│   │   │   ├── dataset.py
│   │   │   ├── splits.py
│   │   │   └── utils.py
│   │   ├── augmentation.py
│   │   ├── data_factory.py
│   │   ├── nuscenes
│   │   │   ├── __init__.py
│   │   │   ├── dataset.py
│   │   │   ├── splits.py
│   │   │   └── utils.py
│   │   └── utils.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── criterion.py
│   │   ├── model_factory.py
│   │   ├── pyramid.py
│   │   ├── ved.py
│   │   └── vpn.py
│   ├── nn
│   │   ├── __init__.py
│   │   ├── classifier.py
│   │   ├── fpn.py
│   │   ├── losses.py
│   │   ├── pyramid.py
│   │   ├── resampler.py
│   │   ├── resnet.py
│   │   ├── topdown.py
│   │   └── transformer.py
│   └── utils
│       ├── __init__.py
│       ├── configs.py
│       ├── confusion.py
│       ├── geometry.py
│       └── visualise.py
└── train.py
/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | notebooks 3 | logs 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies.
96 | #Pipfile.lock 97 | 98 | # celery beat schedule file 99 | celerybeat-schedule 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This work is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc/4.0/. 2 | 3 | -------------------------------------------------------------------------------- /architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tom-roddick/mono-semantic-maps/f6b0f52857a4e622ae42bf24faaf9681cc089f76/architecture.png -------------------------------------------------------------------------------- /configs/datasets/argoverse.yml: -------------------------------------------------------------------------------- 1 | train_dataset: argoverse 2 | dataroot: ${DATA_ROOT}/argoverse/argoverse-tracking 3 | label_root: ${PROCESSED_ROOT}/argoverse/test 4 | img_size: [960, 600] 5 | class_weights: 6 | - 1.7 # drivable_area 7 | - 5.2 # vehicle 8 | - 22.0 # pedestrian 9 | - 9.6 # large_vehicle 10 | - 20.3 # bicycle 11 | - 9.6 # bus 12 | - 7.0 # trailer 13 | - 27.5 # motorcycle 14 | 15 | vpn: 16 | output_size : [38, 60] 17 | 18 | ved: 19 | bottleneck_dim: 28 -------------------------------------------------------------------------------- /configs/datasets/nuscenes.yml: -------------------------------------------------------------------------------- 1 | train_dataset: nuscenes 2 | dataroot: ${DATA_ROOT}/nuscenes 3 | nuscenes_version: v1.0-trainval 4 | label_root: ${PROCESSED_ROOT}/nuscenes/map-labels-v1.2 5 | img_size: [800, 600] 6 | num_class: 14 7 | class_weights: 8 | - 1.7 # drivable_area 9 | - 5.9 # ped_crossing 10 | - 3.3 # walkway 11 | - 4.6 # carpark 12 | - 8.0 # car 13 | - 10.3 # truck 14 | - 10.6 # bus 15 | - 6.9 # trailer 16 | - 11.8 # construction_vehicle 17 | - 30.1 # pedestrian 18 | - 33.6 # motorcycle 19 | - 41.2 # bicycle 20 | - 44.3 # traffic_cone 21 | - 15.9 # barrier 22 | 23 | # Prior probability of a positive prediction, used to initialise classifier 24 | prior: 25 | - 0.44679 # drivable_area 26 | - 0.02407 # ped_crossing 27 | - 0.14491 # walkway 28 | - 0.02994 # carpark 29 | - 0.02086 # car 30 | - 0.00477 # truck 31 | - 0.00156 # bus 32 | - 0.00189 # trailer 33 | - 0.00084 # construction_vehicle 34 | - 0.00119 # pedestrian 35 | - 0.00019 # motorcycle 36 | - 0.00012 # bicycle 37 | - 0.00031 # traffic_cone 38 | - 0.00176 # barrier 39 | 40 | ved: 41 | bottleneck_dim: 18 42 | 43 | vpn: 44 | output_size : [29, 50] -------------------------------------------------------------------------------- /configs/defaults.yml: -------------------------------------------------------------------------------- 1 | 2 | 3 | ### Training options ### 4 | 5 | # IDs of GPUs to use during training 6 | gpus: [0, 2, 3, 4] 7 | 8 | # Number of examples per mini-batch 9 | batch_size: 12 10 | 11 | # Number of dataloader threads 12 | 
num_workers: 8 13 | 14 | # Learning rate 15 | learning_rate: 0.1 16 | 17 | # Decay learning rate by a factor 10 after the following number of epochs 18 | lr_milestones: [150, 185] 19 | 20 | # Weight decay 21 | weight_decay: 0.0001 22 | 23 | # Directory to save experiment to 24 | logdir: logs 25 | 26 | # Number of epochs to train for 27 | num_epochs: 200 28 | 29 | # Number of examples per epoch 30 | epoch_size: 50000 31 | 32 | 33 | #### Data options #### 34 | 35 | # Dataset to train on 36 | train_dataset: nuscenes 37 | 38 | # Name of split used for training 39 | train_split: train 40 | 41 | # Name of split used for validation 42 | val_split: val 43 | 44 | # Root data directory 45 | dataroot: ${DATA_ROOT}/nuscenes 46 | 47 | # NuScenes dataset version 48 | nuscenes_version: v1.0-trainval 49 | 50 | # Directory containing pregenerated training labels 51 | label_root: ${PROCESSED_ROOT}/nuscenes/map-labels-v1.2 52 | 53 | # Input image size after downsampling 54 | img_size: [800, 600] 55 | 56 | # Hold out portion of train data to calibrate on 57 | hold_out_calibration: False 58 | 59 | # Class-specific weighting factors used to balance the cross entropy loss 60 | class_weights: 61 | - 1.7 # drivable_area 62 | - 5.9 # ped_crossing 63 | - 3.3 # walkway 64 | - 4.6 # carpark 65 | - 8.0 # car 66 | - 10.3 # truck 67 | - 10.6 # bus 68 | - 6.9 # trailer 69 | - 11.8 # construction_vehicle 70 | - 30.1 # pedestrian 71 | - 33.6 # motorcycle 72 | - 41.2 # bicycle 73 | - 44.3 # traffic_cone 74 | - 15.9 # barrier 75 | 76 | # Prior probability of a positive prediction, used to initialise classifier 77 | prior: 78 | - 0.44679 # drivable_area 79 | - 0.02407 # ped_crossing 80 | - 0.14491 # walkway 81 | - 0.02994 # carpark 82 | - 0.02086 # car 83 | - 0.00477 # truck 84 | - 0.00156 # bus 85 | - 0.00189 # trailer 86 | - 0.00084 # construction_vehicle 87 | - 0.00119 # pedestrian 88 | - 0.00019 # motorcycle 89 | - 0.00012 # bicycle 90 | - 0.00031 # traffic_cone 91 | - 0.00176 # barrier 92 | 93 | # Whether to use horizontal flips for data augmentation 94 | hflip: True 95 | 96 | # Top-left and bottom right coordinates of map region, in meters 97 | map_extents: [-25., 1., 25., 50.] 98 | 99 | # Spacing between adjacent grid cells in the map, in meters 100 | map_resolution: 0.25 101 | 102 | # Log loss to tensorboard every N iterations 103 | log_interval: 10 104 | 105 | # Visualise predictions every N iterations 106 | vis_interval: 200 107 | 108 | 109 | ### Model options ### 110 | 111 | # Architecture to train [pyramid | ved | vpn ] 112 | model: pyramid 113 | 114 | # Number of intermediate channels in the transformer layer 115 | tfm_channels: 64 116 | 117 | # Vertical extents of the region of interest, in meters 118 | ymin: -2 119 | ymax: 4 120 | 121 | # Approximate camera focal length used for constructing transformers 122 | focal_length: 630. 
123 | 124 | # Topdown network options 125 | topdown: 126 | 127 | # Number of feature channels at each layer of the topdown network 128 | channels: 128 129 | 130 | # Number of blocks in each layer 131 | layers: [4, 4] 132 | 133 | # Upsampling factor in each stage of the topdown network 134 | strides: [1, 2] 135 | 136 | # Type of residual block to use [ basic | bottleneck ] 137 | blocktype: bottleneck 138 | 139 | # Number of output classes to predict 140 | num_class: 14 141 | 142 | # Whether to use Bayesian classifier 143 | bayesian: False 144 | 145 | # Number of samples used for Monte-Carlo inference 146 | mc_samples: 40 147 | 148 | # View parsing network options 149 | vpn: 150 | 151 | # Size of output feature maps 152 | output_size: [29, 50] 153 | 154 | # Number of channels in fully connected layer 155 | fc_dim: 256 156 | 157 | # Variational encoder-decoder network options 158 | ved: 159 | 160 | # Dimensions of bottleneck (depends on the size of input images) 161 | bottleneck_dim: 18 162 | 163 | # Loss function 164 | loss_fn: bce 165 | 166 | # Binary cross entropy loss weight 167 | xent_weight: 1.0 168 | 169 | # Max entropy uncertainty loss weight 170 | uncert_weight: 0.001 171 | 172 | # Focal loss parameters 173 | focal: 174 | alpha: 0.25 175 | gamma: 2 176 | 177 | # KL-Divergence loss weight (used by VED network) 178 | kld_weight: 0.0 179 | 180 | # Method of weighting classes in loss function 181 | weight_mode: sqrt_inverse 182 | 183 | # Threshold to treat prediction as positive 184 | score_thresh: 0.5 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | -------------------------------------------------------------------------------- /configs/experiments/test.yml: -------------------------------------------------------------------------------- 1 | logdir: logs 2 | -------------------------------------------------------------------------------- /configs/models/pyramid.yml: -------------------------------------------------------------------------------- 1 | model: pyramid 2 | tfm_channels: 64 3 | ymin: -2 4 | ymax: 4 5 | focal_length: 630. 6 | topdown: 7 | channels: 128 8 | layers: [4, 4] 9 | strides: [1, 2] 10 | blocktype: bottleneck 11 | num_class: 14 12 | -------------------------------------------------------------------------------- /configs/models/ved.yml: -------------------------------------------------------------------------------- 1 | model: ved 2 | xent_weight: 0.9 3 | uncert_weight: 0.001 4 | kld_weight: 0.1 -------------------------------------------------------------------------------- /configs/models/vpn.yml: -------------------------------------------------------------------------------- 1 | model: vpn -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Predicting Semantic Map Representations from Images with Pyramid Occupancy Networks 2 | 3 | This is the code associated with the paper [Predicting Semantic Map Representations from Images with Pyramid Occupancy Networks](https://arxiv.org/pdf/2003.13402.pdf), published at CVPR 2020. 4 | 5 | ![Pyramid Occupancy Network architecture](architecture.png) 6 | 7 | ## Data generation 8 | In our work we report results on two large-scale autonomous driving datasets: NuScenes and Argoverse. 
The birds-eye-view ground truth labels we use to train and evaluate our networks are generated by combining map information provided by the two datasets with 3D bounding box annotations, which we rasterise to produce a set of one-hot binary labels. We also make use of LiDAR point clouds to infer regions of the birds-eye-view which are completely occluded by buildings or other objects. 9 | 10 | ### NuScenes 11 | To train our method on NuScenes you will first need to: 12 | 1. Download the NuScenes dataset, which can be found at https://www.nuscenes.org/download. Only the metadata, keyframe and lidar blobs are necessary. 13 | 2. Download the map expansion pack. Note that to replicate our original results you should use the original version of the expansion (v1.0). The later versions fixed some bugs in the original maps, so we would expect even better performance! 14 | 3. Install the NuScenes devkit from https://github.com/nutonomy/nuscenes-devkit 15 | 4. Cd to `mono-semantic-maps` 16 | 5. Edit the `configs/datasets/nuscenes.yml` file, setting the `dataroot` and `label_root` entries to the location of the NuScenes dataset and the desired ground truth folder respectively. 17 | 6. Run our data generation script: `python scripts/make_nuscenes_labels.py`. Be warned: there's a lot of data, so this will take a few hours to run! 18 | 19 | ### Argoverse 20 | To train on the Argoverse dataset: 21 | 1. Download the Argoverse tracking data from https://www.argoverse.org/data.html#tracking-link. Our models were trained on version 1.1; you will need to download the four training blobs, the validation blob, and the HD map data. 22 | 2. Install the Argoverse devkit from https://github.com/argoai/argoverse-api 23 | 3. Cd to `mono-semantic-maps` 24 | 4. Edit the `configs/datasets/argoverse.yml` file, setting the `dataroot` and `label_root` entries to the location of the installed Argoverse data and the desired ground truth folder respectively. 25 | 5. Run our data generation script: `python scripts/make_argoverse_labels.py`. This script will also take a while to run! 26 | 27 | 28 | ## Training 29 | Once ground truth labels have been generated, you can train our method by running the `train.py` script in the root directory: 30 | ``` 31 | python train.py --dataset nuscenes --model pyramid 32 | ``` 33 | The `--dataset` flag allows you to specify the dataset to train on, either `'argoverse'` or `'nuscenes'`. The `--model` flag allows training of the proposed method `'pyramid'`, or one of the baseline methods (`'vpn'` or `'ved'`). Additional command line options can be specified by passing a list of key-value pairs to the `--options` flag. The full list of configurable options can be found in the `configs/defaults.yml` file.
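The labels produced by the generation scripts are stored as single-channel 32-bit PNG images: each pixel packs the stack of boolean class masks, plus a final mask marking cells that fall outside the camera frustum or are occluded, into an integer bitmask (see `encode_binary_labels` in `src/data/utils.py`). Below is a minimal inspection sketch showing how one of the generated nuScenes label files can be decoded back into per-class masks using the repository's own helpers; the file path is a placeholder for a real token in your `label_root`.

```python
# Inspection sketch (not part of the training pipeline): decode one generated
# nuScenes label image back into per-class binary masks.
from PIL import Image
from torchvision.transforms.functional import to_tensor

from src.data.utils import decode_binary_labels
from src.data.nuscenes.utils import NUSCENES_CLASS_NAMES

# Placeholder path: any <sample_data_token>.png written by make_nuscenes_labels.py
encoded = to_tensor(Image.open('<label_root>/<sample_data_token>.png')).long()
labels = decode_binary_labels(encoded, len(NUSCENES_CLASS_NAMES) + 1)

# The last channel flags pixels outside the image or occluded by obstacles;
# the remaining channels are one mask per class, in NUSCENES_CLASS_NAMES order.
class_masks, valid = labels[:-1], ~labels[-1]
```

This mirrors the decoding performed in `src/data/nuscenes/dataset.py` and `src/data/argoverse/dataset.py` when training batches are loaded.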
34 | 35 | -------------------------------------------------------------------------------- /scripts/make_argoverse_labels.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | from PIL import Image 5 | from progressbar import ProgressBar 6 | 7 | from argoverse.map_representation.map_api import ArgoverseMap 8 | from argoverse.data_loading.argoverse_tracking_loader \ 9 | import ArgoverseTrackingLoader 10 | from argoverse.utils.camera_stats import RING_CAMERA_LIST 11 | 12 | sys.path.append(os.path.abspath(os.path.join(__file__, '../..'))) 13 | 14 | from src.utils.configs import get_default_configuration 15 | from src.data.utils import get_visible_mask, get_occlusion_mask, \ 16 | encode_binary_labels 17 | from src.data.argoverse.utils import get_object_masks, get_map_mask 18 | 19 | 20 | def process_split(split, map_data, config): 21 | 22 | # Create an Argoverse loader instance 23 | path = os.path.join(os.path.expandvars(config.argoverse.root), split) 24 | print("Loading Argoverse tracking data at " + path) 25 | loader = ArgoverseTrackingLoader(path) 26 | 27 | for scene in loader: 28 | process_scene(split, scene, map_data, config) 29 | 30 | 31 | def process_scene(split, scene, map_data, config): 32 | 33 | print("\n\n==> Processing scene: " + scene.current_log) 34 | 35 | i = 0 36 | progress = ProgressBar( 37 | max_value=len(RING_CAMERA_LIST) * scene.num_lidar_frame) 38 | 39 | # Iterate over each camera and each frame in the sequence 40 | for camera in RING_CAMERA_LIST: 41 | for frame in range(scene.num_lidar_frame): 42 | progress.update(i) 43 | process_frame(split, scene, camera, frame, map_data, config) 44 | i += 1 45 | 46 | 47 | def process_frame(split, scene, camera, frame, map_data, config): 48 | 49 | # Compute object masks 50 | masks = get_object_masks(scene, camera, frame, config.map_extents, 51 | config.map_resolution) 52 | 53 | # Compute drivable area mask 54 | masks[0] = get_map_mask(scene, camera, frame, map_data, config.map_extents, 55 | config.map_resolution) 56 | 57 | # Ignore regions of the BEV which are outside the image 58 | calib = scene.get_calibration(camera) 59 | masks[-1] |= ~get_visible_mask(calib.K, calib.camera_config.img_width, 60 | config.map_extents, config.map_resolution) 61 | 62 | # Ignore regions of the BEV which are occluded (based on LiDAR data) 63 | lidar = scene.get_lidar(frame) 64 | cam_lidar = calib.project_ego_to_cam(lidar) 65 | masks[-1] |= get_occlusion_mask(cam_lidar, config.map_extents, 66 | config.map_resolution) 67 | 68 | # Encode masks as an integer bitmask 69 | labels = encode_binary_labels(masks) 70 | 71 | # Create a filename and directory 72 | timestamp = str(scene.image_timestamp_list_sync[camera][frame]) 73 | output_path = os.path.join(config.argoverse.label_root, split, 74 | scene.current_log, camera, 75 | f'{camera}_{timestamp}.png') 76 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 77 | 78 | # Save encoded label file to disk 79 | Image.fromarray(labels.astype(np.int32), mode='I').save(output_path) 80 | 81 | 82 | if __name__ == '__main__': 83 | 84 | config = get_default_configuration() 85 | config.merge_from_file('configs/datasets/argoverse.yml') 86 | 87 | # Create an Argoverse map instance 88 | map_data = ArgoverseMap() 89 | 90 | for split in ['train', 'val']: 91 | process_split(split, map_data, config) 92 | 93 | 94 | -------------------------------------------------------------------------------- /scripts/make_nuscenes_labels.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | from PIL import Image 5 | from tqdm import tqdm 6 | from collections import OrderedDict 7 | 8 | from shapely.strtree import STRtree 9 | from nuscenes import NuScenes 10 | from nuscenes.map_expansion.map_api import NuScenesMap 11 | 12 | sys.path.append(os.path.abspath(os.path.join(__file__, '../..'))) 13 | 14 | from src.utils.configs import get_default_configuration 15 | from src.data.utils import get_visible_mask, get_occlusion_mask, transform, \ 16 | encode_binary_labels 17 | import src.data.nuscenes.utils as nusc_utils 18 | 19 | 20 | def process_scene(nuscenes, map_data, scene, config): 21 | 22 | # Get the map corresponding to the current sample data 23 | log = nuscenes.get('log', scene['log_token']) 24 | scene_map_data = map_data[log['location']] 25 | 26 | # Iterate over samples 27 | first_sample_token = scene['first_sample_token'] 28 | for sample in nusc_utils.iterate_samples(nuscenes, first_sample_token): 29 | process_sample(nuscenes, scene_map_data, sample, config) 30 | 31 | 32 | def process_sample(nuscenes, map_data, sample, config): 33 | 34 | # Load the lidar point cloud associated with this sample 35 | lidar_data = nuscenes.get('sample_data', sample['data']['LIDAR_TOP']) 36 | lidar_pcl = nusc_utils.load_point_cloud(nuscenes, lidar_data) 37 | 38 | # Transform points into world coordinate system 39 | lidar_transform = nusc_utils.get_sensor_transform(nuscenes, lidar_data) 40 | lidar_pcl = transform(lidar_transform, lidar_pcl) 41 | 42 | # Iterate over sample data 43 | for camera in nusc_utils.CAMERA_NAMES: 44 | sample_data = nuscenes.get('sample_data', sample['data'][camera]) 45 | process_sample_data(nuscenes, map_data, sample_data, lidar_pcl, config) 46 | 47 | 48 | def process_sample_data(nuscenes, map_data, sample_data, lidar, config): 49 | 50 | # Render static road geometry masks 51 | map_masks = nusc_utils.get_map_masks(nuscenes, 52 | map_data, 53 | sample_data, 54 | config.map_extents, 55 | config.map_resolution) 56 | 57 | # Render dynamic object masks 58 | obj_masks = nusc_utils.get_object_masks(nuscenes, 59 | sample_data, 60 | config.map_extents, 61 | config.map_resolution) 62 | masks = np.concatenate([map_masks, obj_masks], axis=0) 63 | 64 | # Ignore regions of the BEV which are outside the image 65 | sensor = nuscenes.get('calibrated_sensor', 66 | sample_data['calibrated_sensor_token']) 67 | intrinsics = np.array(sensor['camera_intrinsic']) 68 | masks[-1] |= ~get_visible_mask(intrinsics, sample_data['width'], 69 | config.map_extents, config.map_resolution) 70 | 71 | # Transform lidar points into camera coordinates 72 | cam_transform = nusc_utils.get_sensor_transform(nuscenes, sample_data) 73 | cam_points = transform(np.linalg.inv(cam_transform), lidar) 74 | masks[-1] |= get_occlusion_mask(cam_points, config.map_extents, 75 | config.map_resolution) 76 | 77 | # Encode masks as integer bitmask 78 | labels = encode_binary_labels(masks) 79 | 80 | # Save outputs to disk 81 | output_path = os.path.join(os.path.expandvars(config.label_root), 82 | sample_data['token'] + '.png') 83 | Image.fromarray(labels.astype(np.int32), mode='I').save(output_path) 84 | 85 | 86 | def load_map_data(dataroot, location): 87 | 88 | # Load the NuScenes map object 89 | nusc_map = NuScenesMap(dataroot, location) 90 | 91 | map_data = OrderedDict() 92 | for layer in nusc_utils.STATIC_CLASSES: 93 | 94 | # Retrieve all data associated with the current layer 95 | records = 
getattr(nusc_map, layer) 96 | polygons = list() 97 | 98 | # Drivable area records can contain multiple polygons 99 | if layer == 'drivable_area': 100 | for record in records: 101 | 102 | # Convert each entry in the record into a shapely object 103 | for token in record['polygon_tokens']: 104 | poly = nusc_map.extract_polygon(token) 105 | if poly.is_valid: 106 | polygons.append(poly) 107 | else: 108 | for record in records: 109 | 110 | # Convert each entry in the record into a shapely object 111 | poly = nusc_map.extract_polygon(record['polygon_token']) 112 | if poly.is_valid: 113 | polygons.append(poly) 114 | 115 | 116 | # Store as an R-Tree for fast intersection queries 117 | map_data[layer] = STRtree(polygons) 118 | 119 | return map_data 120 | 121 | 122 | 123 | 124 | 125 | if __name__ == '__main__': 126 | 127 | # Load the default configuration 128 | config = get_default_configuration() 129 | config.merge_from_file('configs/datasets/nuscenes.yml') 130 | 131 | # Load NuScenes dataset 132 | dataroot = os.path.expandvars(config.dataroot) 133 | nuscenes = NuScenes(config.nuscenes_version, dataroot) 134 | 135 | # Preload NuScenes map data 136 | map_data = { location : load_map_data(dataroot, location) 137 | for location in nusc_utils.LOCATIONS } 138 | 139 | # Create a directory for the generated labels 140 | output_root = os.path.expandvars(config.label_root) 141 | os.makedirs(output_root, exist_ok=True) 142 | 143 | # print(nuscenes.scene) 144 | # Iterate over NuScene scenes 145 | print("\nGenerating labels...") 146 | for scene in tqdm(nuscenes.scene): 147 | process_scene(nuscenes, map_data, scene, config) 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tom-roddick/mono-semantic-maps/f6b0f52857a4e622ae42bf24faaf9681cc089f76/src/__init__.py -------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tom-roddick/mono-semantic-maps/f6b0f52857a4e622ae42bf24faaf9681cc089f76/src/data/__init__.py -------------------------------------------------------------------------------- /src/data/argoverse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tom-roddick/mono-semantic-maps/f6b0f52857a4e622ae42bf24faaf9681cc089f76/src/data/argoverse/__init__.py -------------------------------------------------------------------------------- /src/data/argoverse/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | import torch 4 | from torch.utils.data import Dataset 5 | from torchvision.transforms.functional import to_tensor 6 | from argoverse.data_loading.argoverse_tracking_loader \ 7 | import ArgoverseTrackingLoader 8 | from argoverse.utils.camera_stats import RING_CAMERA_LIST 9 | 10 | from .utils import IMAGE_WIDTH, IMAGE_HEIGHT, ARGOVERSE_CLASS_NAMES 11 | from ..utils import decode_binary_labels 12 | 13 | 14 | class ArgoverseMapDataset(Dataset): 15 | 16 | def __init__(self, argo_loaders, label_root, image_size=[960, 600], 17 | log_names=None): 18 | 19 | self.label_root = label_root 20 | self.image_size = image_size 21 | 22 | self.examples = 
dict() 23 | self.calibs = dict() 24 | 25 | # Preload training examples from Argoverse train and test sets 26 | self.loaders = argo_loaders 27 | for split, loader in self.loaders.items(): 28 | self.preload(split, loader, log_names) 29 | 30 | 31 | def preload(self, split, loader, log_names=None): 32 | 33 | # Iterate over sequences 34 | for log in loader: 35 | 36 | # Check if the log is within the current dataset split 37 | logid = log.current_log 38 | if log_names is not None and logid not in log_names: 39 | continue 40 | 41 | self.calibs[logid] = dict() 42 | for camera, timestamps in log.image_timestamp_list_sync.items(): 43 | 44 | if camera not in RING_CAMERA_LIST: 45 | continue 46 | 47 | # Load image paths 48 | for timestamp in timestamps: 49 | self.examples[timestamp] = (split, logid, camera) 50 | 51 | 52 | def __len__(self): 53 | return len(self.examples) 54 | 55 | 56 | def __getitem__(self, timestamp): 57 | 58 | # Get the split, log and camera ids corresponding to the given timestamp 59 | split, log, camera = self.examples[timestamp] 60 | 61 | image = self.load_image(split, log, camera, timestamp) 62 | calib = self.load_calib(split, log, camera) 63 | labels, mask = self.load_labels(split, log, camera, timestamp) 64 | 65 | return image, calib, labels, mask 66 | 67 | 68 | def load_image(self, split, log, camera, timestamp): 69 | 70 | # Load image 71 | loader = self.loaders[split] 72 | image = loader.get_image_at_timestamp(timestamp, camera, log) 73 | 74 | # Resize to the desired dimensions 75 | image = image.resize(self.image_size) 76 | 77 | return to_tensor(image) 78 | 79 | 80 | def load_calib(self, split, log, camera): 81 | 82 | # Get the loader for the current split 83 | loader = self.loaders[split] 84 | 85 | # Get intrinsics matrix and rescale to account for downsampling 86 | calib = loader.get_calibration(camera, log).K[:,:3] 87 | calib[0] *= self.image_size[0] / IMAGE_WIDTH 88 | calib[1] *= self.image_size[1] / IMAGE_HEIGHT 89 | 90 | # Convert to a torch tensor 91 | return torch.from_numpy(calib) 92 | 93 | 94 | def load_labels(self, split, log, camera, timestamp): 95 | 96 | # Construct label path from example data (matches the layout written by 97 | # scripts/make_argoverse_labels.py) 98 | label_path = os.path.join(self.label_root, split, log, camera, f'{camera}_{timestamp}.png') 99 | 100 | # Load encoded label image as a torch tensor 101 | encoded_labels = to_tensor(Image.open(label_path)).long() 102 | 103 | # Decode to binary labels 104 | num_class = len(ARGOVERSE_CLASS_NAMES) 105 | labels = decode_binary_labels(encoded_labels, num_class + 1) 106 | labels, mask = labels[:-1], ~labels[-1] 107 | 108 | return labels, mask 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /src/data/argoverse/splits.py: -------------------------------------------------------------------------------- 1 | 2 | TRAIN_LOGS = [ 3 | "26d141ec-f952-3908-b4cc-ae359377424e", 4 | "10b8dee6-778f-33e4-a946-d842d2d9c3d7", 5 | "273c1883-673a-36bf-b124-88311b1a80be", 6 | "f3fb839e-0aa2-342b-81c3-312b80be44f9", 7 | "a073e840-6319-3f0b-843e-f6dccdcc7b77", 8 | "c6911883-1843-3727-8eaa-41dc8cda8993", 9 | "dcdcd8b3-0ba1-3218-b2ea-7bb965aad3f0", 10 | "043aeba7-14e5-3cde-8a5c-639389b6d3a6", 11 | "230970eb-dc2e-3133-b252-ff3c6f5d4284", 12 | "b3def699-884b-3c9e-87e1-1ab76c618e0b", 13 | "e17eed4f-3ffd-3532-ab89-41a3f24cf226", 14 | "8a15674a-ae5c-38e2-bc4b-f4156d384072", 15 | "11953248-1195-1195-1195-511954366464", 16 | "3d20ae25-5b29-320d-8bae-f03e9dc177b9", 17 | "10b3a1d8-e56c-38be-aaf7-ef2f862a5c4e", 18 |
"02cf0ce1-699a-373b-86c0-eb6fd5f4697a", 19 | "08a8b7f0-c317-3bdb-b3dc-b7c9b6d033e2", 20 | "15c802a9-0f0e-3c87-b516-a3fa02f1ecb0", 21 | "0ef28d5c-ae34-370b-99e7-6709e1c4b929", 22 | "22160544-2216-2216-2216-722161741824", 23 | "38b2c7ef-069b-3d9d-bbeb-8847b8c89fb6", 24 | "45753856-4575-4575-4575-345754906624", 25 | "53037376-5303-5303-5303-553038557184", 26 | "5c251c22-11b2-3278-835c-0cf3cdee3f44", 27 | "74750688-7475-7475-7475-474752397312", 28 | "75756160-7575-7575-7575-675757273088", 29 | "bae67a44-0f30-30c1-8999-06fc1c7ab80a", 30 | "e8ce69b2-36ab-38e8-87a4-b9e20fee7fd2", 31 | "ebe7a98b-d383-343b-96d6-9e681e2c6a36", 32 | "f0826a9f-f46e-3c27-97af-87a77f7899cd", 33 | "fa0b626f-03df-35a0-8447-021088814b8b", 34 | "10f92308-e06e-3725-a302-4b09e6e790ad", 35 | "29789600-2979-2979-2979-429790834688", 36 | "6162d72f-2990-3a30-9bba-19bbd882985c", 37 | "649750f3-0163-34eb-a102-7aaf5384eaec", 38 | "6c739f57-96d0-33e6-972d-af29cc527e1f", 39 | "95731808-9573-9573-9573-295732883456", 40 | "a6cab660-f086-3e2a-8ad9-7144f93f5b68", 41 | "aebe6aaa-6a95-39e6-9a8d-06103141fcde", 42 | "af706af1-a226-3f6f-8d65-b1f4b9457c48", 43 | "e9bb51af-1112-34c2-be3e-7ebe826649b4", 44 | "ff78e1a3-6deb-34a4-9a1f-b85e34980f06", 45 | "2bc6a872-9979-3493-82eb-fb55407473c9", 46 | "2c07fcda-6671-3ac0-ac23-4a232e0e031e", 47 | "49d66e75-3ce6-316b-b589-f659c7ef5e6d", 48 | "91326240-9132-9132-9132-591327440896", 49 | "5ab2697b-6e3e-3454-a36a-aba2c6f27818", 50 | "e9a96218-365b-3ecd-a800-ed2c4c306c78", 51 | "cb0cba51-dfaf-34e9-a0c2-d931404c3dd8", 52 | "b1ca08f1-24b0-3c39-ba4e-d5a92868462c", 53 | "1d676737-4110-3f7e-bec0-0c90f74c248f", 54 | "da734d26-8229-383f-b685-8086e58d1e05", 55 | "cd5bb988-092e-396c-8f33-e30969c98535", 56 | "f9fa3960-537f-3151-a1a3-37a9c0d6d7f7", 57 | "aeb73d7a-8257-3225-972e-99307b3a5cb0", 58 | "39556000-3955-3955-3955-039557148672", 59 | "c9d6ebeb-be15-3df8-b6f1-5575bea8e6b9", 60 | "5f317f5f-3ce9-355b-acf9-386a8c682252", 61 | "64724064-6472-6472-6472-764725145600", 62 | "cd64733a-dd8a-3bdf-b46a-b7144226168a", 63 | "6db21fda-80cd-3f85-b4a7-0aadeb14724d", 64 | "f1008c18-e76e-3c24-adcc-da9858fac145", 65 | "00c561b9-2057-358d-82c6-5b06d76cebcf", 66 | "cb762bb1-7ce1-3ba5-b53d-13c159b532c8", 67 | "2d12da1d-5238-3870-bfbc-b281d5e8c1a1" 68 | ] 69 | 70 | 71 | VAL_LOGS = [ 72 | "6f153f9c-edc5-389f-ac6f-40705c30d97e", 73 | "25952736-2595-2595-2595-225953853440", 74 | "88538208-8853-8853-8853-388539396096", 75 | "84c35ea7-1a99-3a0c-a3ea-c5915d68acbc", 76 | "64c12551-adb9-36e3-a0c1-e43a0e9f3845", 77 | "3138907e-1f8a-362f-8f3d-773f795a0d01", 78 | "4137e94a-c5da-38bd-ad06-6d57b24bccd0", 79 | "53213cf0-540b-3b5a-9900-d24d1d41bda0", 80 | "577ea60d-7cc0-34a4-a8ff-0401e5ab9c62", 81 | "d60558d2-d1aa-34ee-a902-e061e346e02a", 82 | "fb471bd6-7c81-3d93-ad12-ac54a28beb84", 83 | "52af191b-ba56-326c-b569-e37790db40f3", 84 | "919be600-da69-3f09-b0fd-f42f7eb2e097", 85 | "99c45b6e-6fc7-39b8-80d7-727c485fb561", 86 | "9da4ca63-f524-3b38-8c8b-624f17518574", 87 | "ba067318-0d89-34b5-b577-b171b1a4212b", 88 | "cd38ac0b-c5a6-3743-a148-f4f7b804ed17", 89 | "d4d9e91f-0f8e-334d-bd0e-0d062467308a", 90 | "de777454-df62-3d5a-a1ce-2edb5e5d4922", 91 | "70d2aea5-dbeb-333d-b21e-76a7f2f1ba1c", 92 | "033669d3-3d6b-3d3d-bd93-7985d86653ea", 93 | "7d37fc6b-1028-3f6f-b980-adb5fa73021e", 94 | "33737504-3373-3373-3373-633738571776", 95 | "85bc130b-97ae-37fb-a129-4fc07c80cca7" 96 | ] -------------------------------------------------------------------------------- /src/data/argoverse/utils.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.ndimage import affine_transform 3 | from ..utils import render_polygon 4 | 5 | 6 | # Define Argoverse-specific constants 7 | IMAGE_WIDTH = 1920 8 | IMAGE_HEIGHT = 1200 9 | 10 | ARGOVERSE_CLASS_NAMES = [ 11 | 'drivable_area', 'vehicle', 'pedestrian', 'large_vehicle', 'bicycle', 'bus', 12 | 'trailer', 'motorcycle', 13 | ] 14 | 15 | ARGOVERSE_CLASS_MAPPING = { 16 | 'VEHICLE' : 'vehicle', 17 | 'PEDESTRIAN' : 'pedestrian', 18 | # 'ON_ROAD_OBSTACLE' : 'ignore', 19 | 'LARGE_VEHICLE' : 'large_vehicle', 20 | 'BICYCLE' : 'bicycle', 21 | 'BICYCLIST' : 'bicycle', 22 | 'BUS' : 'bus', 23 | # 'OTHER_MOVER' : 'ignore', 24 | 'TRAILER' : 'trailer', 25 | 'MOTORCYCLIST' : 'motorcycle', 26 | 'MOPED' : 'motorcycle', 27 | 'MOTORCYCLE' : 'motorcycle', 28 | # 'STROLLER' : 'ignore', 29 | 'EMERGENCY_VEHICLE' : 'vehicle', 30 | # 'ANIMAL' : 'ignore', 31 | } 32 | 33 | def argoverse_name_to_class_id(name): 34 | if name in ARGOVERSE_CLASS_MAPPING: 35 | return ARGOVERSE_CLASS_NAMES.index(ARGOVERSE_CLASS_MAPPING[name]) 36 | else: 37 | return -1 38 | 39 | 40 | def get_object_masks(scene, camera, frame, extents, resolution): 41 | 42 | # Get the dimensions of the birds-eye-view mask 43 | x1, z1, x2, z2 = extents 44 | mask_width = int((x2 - x1) / resolution) 45 | mask_height = int((z2 - z1) / resolution) 46 | 47 | # Initialise masks 48 | num_class = len(ARGOVERSE_CLASS_NAMES) 49 | masks = np.zeros((num_class + 1, mask_height, mask_width), dtype=np.uint8) 50 | 51 | # Get calibration information 52 | calib = scene.get_calibration(camera) 53 | 54 | # Iterate over objects in the scene 55 | for obj in scene.get_label_object(frame): 56 | 57 | # Get the bounding box and convert into camera coordinates 58 | bbox = obj.as_2d_bbox()[[0, 1, 3, 2]] 59 | cam_bbox = calib.project_ego_to_cam(bbox)[:, [0, 2]] 60 | 61 | # Render the bounding box to the appropriate mask layer 62 | class_id = argoverse_name_to_class_id(obj.label_class) 63 | render_polygon(masks[class_id], cam_bbox, extents, resolution) 64 | 65 | return masks.astype(np.bool) 66 | 67 | 68 | def get_map_mask(scene, camera, frame, map_data, extents, resolution): 69 | 70 | # Get the dimensions of the birds-eye-view mask 71 | x1, z1, x2, z2 = extents 72 | mask_width = int((x2 - x1) / resolution) 73 | mask_height = int((z2 - z1) / resolution) 74 | 75 | # Get rasterised map 76 | city_mask, map_tfm = map_data.get_rasterized_driveable_area(scene.city_name) 77 | 78 | # Get 3D transform from camera to world coordinates 79 | extrinsic = scene.get_calibration(camera).extrinsic 80 | pose = scene.get_pose(frame).transform_matrix 81 | cam_to_world_tfm = np.matmul(pose, np.linalg.inv(extrinsic)) 82 | 83 | # Get 2D affine transform from camera to map coordinates 84 | cam_to_map_tfm = np.matmul(map_tfm, cam_to_world_tfm[[0, 1, 3]]) 85 | 86 | # Get 2D affine transform from BEV coords to map coords 87 | bev_to_cam_tfm = np.array([[resolution, 0, x1], 88 | [0, resolution, z1], 89 | [0, 0, 1]]) 90 | bev_to_map_tfm = np.matmul(cam_to_map_tfm[:, [0, 2, 3]], bev_to_cam_tfm) 91 | 92 | # Warp map image to bev coordinate system 93 | mask = affine_transform(city_mask, bev_to_map_tfm[[1, 0]], 94 | output_shape=(mask_width, mask_height)).T 95 | return mask[None] 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /src/data/augmentation.py:
-------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | class AugmentedMapDataset(Dataset): 5 | 6 | def __init__(self, dataset, hflip=True): 7 | self.dataset = dataset 8 | self.hflip = hflip 9 | 10 | def __len__(self): 11 | return len(self.dataset) 12 | 13 | def __getitem__(self, index): 14 | image, calib, labels, mask = self.dataset[index] 15 | 16 | # Apply data augmentation 17 | if self.hflip: 18 | image, labels, mask = random_hflip(image, labels, mask) 19 | 20 | return image, calib, labels, mask 21 | 22 | 23 | def random_hflip(image, labels, mask): 24 | image = torch.flip(image, (-1,)) 25 | labels = torch.flip(labels.int(), (-1,)).bool() 26 | mask = torch.flip(mask.int(), (-1,)).bool() 27 | return image, labels, mask -------------------------------------------------------------------------------- /src/data/data_factory.py: -------------------------------------------------------------------------------- 1 | import os 2 | from torch.utils.data import DataLoader, RandomSampler 3 | from .augmentation import AugmentedMapDataset 4 | 5 | from nuscenes import NuScenes 6 | from .nuscenes.dataset import NuScenesMapDataset 7 | from .nuscenes.splits import TRAIN_SCENES, VAL_SCENES, CALIBRATION_SCENES 8 | 9 | from argoverse.data_loading.argoverse_tracking_loader import ArgoverseTrackingLoader 10 | from .argoverse.dataset import ArgoverseMapDataset 11 | from .argoverse.splits import TRAIN_LOGS, VAL_LOGS 12 | 13 | 14 | def build_nuscenes_datasets(config): 15 | print('==> Loading NuScenes dataset...') 16 | nuscenes = NuScenes(config.nuscenes_version, 17 | os.path.expandvars(config.dataroot)) 18 | 19 | # Exclude calibration scenes 20 | if config.hold_out_calibration: 21 | train_scenes = list(set(TRAIN_SCENES) - set(CALIBRATION_SCENES)) 22 | else: 23 | train_scenes = TRAIN_SCENES 24 | 25 | train_data = NuScenesMapDataset(nuscenes, config.label_root, 26 | config.img_size, train_scenes) 27 | val_data = NuScenesMapDataset(nuscenes, config.label_root, 28 | config.img_size, VAL_SCENES) 29 | return train_data, val_data 30 | 31 | 32 | def build_argoverse_datasets(config): 33 | print('==> Loading Argoverse dataset...') 34 | dataroot = os.path.expandvars(config.dataroot) 35 | 36 | # Load native argoverse splits 37 | loaders = { 38 | 'train' : ArgoverseTrackingLoader(os.path.join(dataroot, 'train')), 39 | 'val' : ArgoverseTrackingLoader(os.path.join(dataroot, 'val')) 40 | } 41 | 42 | # Create datasets using new argoverse splits 43 | train_data = ArgoverseMapDataset(loaders, config.label_root, 44 | config.img_size, TRAIN_LOGS) 45 | val_data = ArgoverseMapDataset(loaders, config.label_root, 46 | config.img_size, VAL_LOGS) 47 | return train_data, val_data 48 | 49 | 50 | def build_datasets(dataset_name, config): 51 | if dataset_name == 'nuscenes': 52 | return build_nuscenes_datasets(config) 53 | elif dataset_name == 'argoverse': 54 | return build_argoverse_datasets(config) 55 | else: 56 | raise ValueError(f"Unknown dataset option '{dataset_name}'") 57 | 58 | 59 | 60 | def build_trainval_datasets(dataset_name, config): 61 | 62 | # Construct the base dataset 63 | train_data, val_data = build_datasets(dataset_name, config) 64 | 65 | # Add data augmentation to train dataset 66 | train_data = AugmentedMapDataset(train_data, config.hflip) 67 | 68 | return train_data, val_data 69 | 70 | 71 | def build_dataloaders(dataset_name, config): 72 | 73 | # Build training and validation datasets 74 | train_data, val_data = 
build_trainval_datasets(dataset_name, config) 75 | 76 | # Create training set dataloader 77 | sampler = RandomSampler(train_data, True, config.epoch_size) 78 | train_loader = DataLoader(train_data, config.batch_size, sampler=sampler, 79 | num_workers=config.num_workers) 80 | 81 | # Create validation dataloader 82 | val_loader = DataLoader(val_data, config.batch_size, 83 | num_workers=config.num_workers) 84 | 85 | return train_loader, val_loader 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /src/data/nuscenes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tom-roddick/mono-semantic-maps/f6b0f52857a4e622ae42bf24faaf9681cc089f76/src/data/nuscenes/__init__.py -------------------------------------------------------------------------------- /src/data/nuscenes/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.data import Dataset 4 | from PIL import Image, ImageFile 5 | from nuscenes import NuScenes 6 | from torchvision.transforms.functional import to_tensor 7 | 8 | from .utils import CAMERA_NAMES, NUSCENES_CLASS_NAMES, iterate_samples 9 | from ..utils import decode_binary_labels 10 | 11 | class NuScenesMapDataset(Dataset): 12 | 13 | def __init__(self, nuscenes, map_root, image_size=(800, 450), 14 | scene_names=None): 15 | 16 | self.nuscenes = nuscenes 17 | self.map_root = os.path.expandvars(map_root) 18 | self.image_size = image_size 19 | 20 | # Preload the list of tokens in the dataset 21 | self.get_tokens(scene_names) 22 | 23 | # Allow PIL to load partially corrupted images 24 | # (otherwise training crashes at the most inconvenient possible times!) 
25 | ImageFile.LOAD_TRUNCATED_IMAGES = True 26 | 27 | 28 | def get_tokens(self, scene_names=None): 29 | 30 | self.tokens = list() 31 | 32 | # Iterate over scenes 33 | for scene in self.nuscenes.scene: 34 | 35 | # Ignore scenes which don't belong to the current split 36 | if scene_names is not None and scene['name'] not in scene_names: 37 | continue 38 | 39 | # Iterate over samples 40 | for sample in iterate_samples(self.nuscenes, 41 | scene['first_sample_token']): 42 | 43 | # Iterate over cameras 44 | for camera in CAMERA_NAMES: 45 | self.tokens.append(sample['data'][camera]) 46 | 47 | return self.tokens 48 | 49 | 50 | def __len__(self): 51 | return len(self.tokens) 52 | 53 | def __getitem__(self, index): 54 | token = self.tokens[index] 55 | 56 | image = self.load_image(token) 57 | calib = self.load_calib(token) 58 | labels, mask = self.load_labels(token) 59 | 60 | return image, calib, labels, mask 61 | 62 | 63 | def load_image(self, token): 64 | 65 | # Load image as a PIL image 66 | image = Image.open(self.nuscenes.get_sample_data_path(token)) 67 | 68 | # Resize to input resolution 69 | image = image.resize(self.image_size) 70 | 71 | # Convert to a torch tensor 72 | return to_tensor(image) 73 | 74 | 75 | def load_calib(self, token): 76 | 77 | # Load camera intrinsics matrix 78 | sample_data = self.nuscenes.get('sample_data', token) 79 | sensor = self.nuscenes.get( 80 | 'calibrated_sensor', sample_data['calibrated_sensor_token']) 81 | intrinsics = torch.tensor(sensor['camera_intrinsic']) 82 | 83 | # Scale calibration matrix to account for image downsampling 84 | intrinsics[0] *= self.image_size[0] / sample_data['width'] 85 | intrinsics[1] *= self.image_size[1] / sample_data['height'] 86 | return intrinsics 87 | 88 | 89 | def load_labels(self, token): 90 | 91 | # Load label image as a torch tensor 92 | label_path = os.path.join(self.map_root, token + '.png') 93 | encoded_labels = to_tensor(Image.open(label_path)).long() 94 | 95 | # Decode to binary labels 96 | num_class = len(NUSCENES_CLASS_NAMES) 97 | labels = decode_binary_labels(encoded_labels, num_class + 1) 98 | labels, mask = labels[:-1], ~labels[-1] 99 | 100 | return labels, mask 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /src/data/nuscenes/splits.py: -------------------------------------------------------------------------------- 1 | 2 | TRAIN_SCENES = [ 3 | "scene-0002", "scene-0003", "scene-0004", "scene-0005", "scene-0006", 4 | "scene-0007", "scene-0008", "scene-0009", "scene-0012", "scene-0013", 5 | "scene-0014", "scene-0015", "scene-0016", "scene-0017", "scene-0018", 6 | "scene-0019", "scene-0021", "scene-0022", "scene-0023", "scene-0024", 7 | "scene-0025", "scene-0026", "scene-0027", "scene-0028", "scene-0029", 8 | "scene-0030", "scene-0031", "scene-0032", "scene-0033", "scene-0034", 9 | "scene-0035", "scene-0036", "scene-0039", "scene-0042", "scene-0043", 10 | "scene-0044", "scene-0045", "scene-0046", "scene-0047", "scene-0048", 11 | "scene-0049", "scene-0050", "scene-0051", "scene-0052", "scene-0055", 12 | "scene-0056", "scene-0057", "scene-0058", "scene-0059", "scene-0060", 13 | "scene-0061", "scene-0062", "scene-0063", "scene-0064", "scene-0065", 14 | "scene-0066", "scene-0067", "scene-0068", "scene-0069", "scene-0070", 15 | "scene-0071", "scene-0072", "scene-0073", "scene-0074", "scene-0075", 16 | "scene-0076", "scene-0092", "scene-0093", "scene-0094", "scene-0095", 17 | "scene-0096", "scene-0097", "scene-0098", "scene-0099", "scene-0100", 18 | "scene-0101", 
"scene-0102", "scene-0103", "scene-0104", "scene-0105", 19 | "scene-0106", "scene-0107", "scene-0108", "scene-0109", "scene-0110", 20 | "scene-0120", "scene-0123", "scene-0124", "scene-0125", "scene-0126", 21 | "scene-0127", "scene-0128", "scene-0129", "scene-0130", "scene-0131", 22 | "scene-0132", "scene-0133", "scene-0134", "scene-0135", "scene-0138", 23 | "scene-0149", "scene-0150", "scene-0151", "scene-0154", "scene-0155", 24 | "scene-0157", "scene-0158", "scene-0159", "scene-0161", "scene-0162", 25 | "scene-0163", "scene-0164", "scene-0165", "scene-0166", "scene-0167", 26 | "scene-0168", "scene-0170", "scene-0171", "scene-0172", "scene-0173", 27 | "scene-0174", "scene-0175", "scene-0176", "scene-0177", "scene-0178", 28 | "scene-0179", "scene-0180", "scene-0181", "scene-0182", "scene-0183", 29 | "scene-0185", "scene-0187", "scene-0188", "scene-0190", "scene-0191", 30 | "scene-0192", "scene-0193", "scene-0194", "scene-0195", "scene-0196", 31 | "scene-0199", "scene-0200", "scene-0202", "scene-0203", "scene-0204", 32 | "scene-0206", "scene-0207", "scene-0208", "scene-0209", "scene-0210", 33 | "scene-0211", "scene-0212", "scene-0213", "scene-0214", "scene-0218", 34 | "scene-0219", "scene-0220", "scene-0221", "scene-0222", "scene-0224", 35 | "scene-0225", "scene-0226", "scene-0227", "scene-0228", "scene-0229", 36 | "scene-0230", "scene-0231", "scene-0232", "scene-0233", "scene-0234", 37 | "scene-0235", "scene-0236", "scene-0237", "scene-0238", "scene-0239", 38 | "scene-0240", "scene-0241", "scene-0242", "scene-0243", "scene-0244", 39 | "scene-0245", "scene-0246", "scene-0247", "scene-0248", "scene-0249", 40 | "scene-0250", "scene-0251", "scene-0252", "scene-0253", "scene-0254", 41 | "scene-0255", "scene-0256", "scene-0257", "scene-0258", "scene-0259", 42 | "scene-0260", "scene-0261", "scene-0262", "scene-0263", "scene-0264", 43 | "scene-0268", "scene-0270", "scene-0271", "scene-0272", "scene-0273", 44 | "scene-0274", "scene-0275", "scene-0276", "scene-0277", "scene-0278", 45 | "scene-0283", "scene-0284", "scene-0285", "scene-0286", "scene-0287", 46 | "scene-0288", "scene-0289", "scene-0290", "scene-0291", "scene-0292", 47 | "scene-0293", "scene-0294", "scene-0295", "scene-0296", "scene-0297", 48 | "scene-0298", "scene-0299", "scene-0300", "scene-0301", "scene-0302", 49 | "scene-0303", "scene-0304", "scene-0305", "scene-0306", "scene-0315", 50 | "scene-0316", "scene-0317", "scene-0318", "scene-0321", "scene-0323", 51 | "scene-0324", "scene-0328", "scene-0329", "scene-0330", "scene-0331", 52 | "scene-0332", "scene-0344", "scene-0345", "scene-0346", "scene-0349", 53 | "scene-0350", "scene-0351", "scene-0352", "scene-0353", "scene-0354", 54 | "scene-0355", "scene-0356", "scene-0357", "scene-0358", "scene-0359", 55 | "scene-0360", "scene-0361", "scene-0362", "scene-0363", "scene-0364", 56 | "scene-0365", "scene-0367", "scene-0370", "scene-0371", "scene-0372", 57 | "scene-0373", "scene-0374", "scene-0375", "scene-0376", "scene-0377", 58 | "scene-0379", "scene-0380", "scene-0381", "scene-0382", "scene-0383", 59 | "scene-0384", "scene-0385", "scene-0386", "scene-0388", "scene-0399", 60 | "scene-0400", "scene-0401", "scene-0402", "scene-0403", "scene-0405", 61 | "scene-0406", "scene-0407", "scene-0408", "scene-0420", "scene-0421", 62 | "scene-0422", "scene-0423", "scene-0424", "scene-0425", "scene-0426", 63 | "scene-0427", "scene-0428", "scene-0429", "scene-0430", "scene-0431", 64 | "scene-0432", "scene-0433", "scene-0434", "scene-0435", "scene-0436", 65 | "scene-0437", "scene-0438", "scene-0439", 
"scene-0440", "scene-0441", 66 | "scene-0442", "scene-0443", "scene-0444", "scene-0445", "scene-0446", 67 | "scene-0447", "scene-0448", "scene-0449", "scene-0450", "scene-0451", 68 | "scene-0452", "scene-0453", "scene-0454", "scene-0455", "scene-0456", 69 | "scene-0457", "scene-0458", "scene-0459", "scene-0461", "scene-0462", 70 | "scene-0463", "scene-0464", "scene-0465", "scene-0467", "scene-0468", 71 | "scene-0469", "scene-0471", "scene-0472", "scene-0474", "scene-0475", 72 | "scene-0476", "scene-0477", "scene-0478", "scene-0479", "scene-0480", 73 | "scene-0499", "scene-0500", "scene-0501", "scene-0502", "scene-0504", 74 | "scene-0505", "scene-0506", "scene-0507", "scene-0508", "scene-0509", 75 | "scene-0510", "scene-0511", "scene-0512", "scene-0513", "scene-0514", 76 | "scene-0515", "scene-0517", "scene-0518", "scene-0519", "scene-0520", 77 | "scene-0521", "scene-0522", "scene-0523", "scene-0524", "scene-0552", 78 | "scene-0553", "scene-0554", "scene-0555", "scene-0559", "scene-0560", 79 | "scene-0561", "scene-0562", "scene-0563", "scene-0564", "scene-0565", 80 | "scene-0584", "scene-0585", "scene-0586", "scene-0587", "scene-0588", 81 | "scene-0589", "scene-0590", "scene-0591", "scene-0592", "scene-0593", 82 | "scene-0594", "scene-0595", "scene-0596", "scene-0597", "scene-0598", 83 | "scene-0599", "scene-0600", "scene-0625", "scene-0626", "scene-0627", 84 | "scene-0629", "scene-0630", "scene-0632", "scene-0633", "scene-0634", 85 | "scene-0635", "scene-0636", "scene-0637", "scene-0638", "scene-0639", 86 | "scene-0640", "scene-0652", "scene-0653", "scene-0654", "scene-0655", 87 | "scene-0656", "scene-0657", "scene-0658", "scene-0659", "scene-0660", 88 | "scene-0661", "scene-0662", "scene-0663", "scene-0664", "scene-0665", 89 | "scene-0666", "scene-0667", "scene-0668", "scene-0669", "scene-0670", 90 | "scene-0671", "scene-0672", "scene-0673", "scene-0674", "scene-0675", 91 | "scene-0676", "scene-0677", "scene-0678", "scene-0679", "scene-0681", 92 | "scene-0683", "scene-0684", "scene-0685", "scene-0686", "scene-0687", 93 | "scene-0688", "scene-0689", "scene-0695", "scene-0696", "scene-0697", 94 | "scene-0698", "scene-0700", "scene-0701", "scene-0703", "scene-0704", 95 | "scene-0705", "scene-0706", "scene-0707", "scene-0708", "scene-0709", 96 | "scene-0710", "scene-0711", "scene-0712", "scene-0713", "scene-0714", 97 | "scene-0715", "scene-0716", "scene-0717", "scene-0718", "scene-0719", 98 | "scene-0726", "scene-0727", "scene-0728", "scene-0730", "scene-0731", 99 | "scene-0733", "scene-0734", "scene-0735", "scene-0736", "scene-0737", 100 | "scene-0738", "scene-0780", "scene-0781", "scene-0782", "scene-0783", 101 | "scene-0784", "scene-0786", "scene-0787", "scene-0789", "scene-0790", 102 | "scene-0791", "scene-0792", "scene-0802", "scene-0806", "scene-0808", 103 | "scene-0809", "scene-0810", "scene-0811", "scene-0812", "scene-0813", 104 | "scene-0815", "scene-0816", "scene-0817", "scene-0819", "scene-0820", 105 | "scene-0821", "scene-0822", "scene-0847", "scene-0848", "scene-0849", 106 | "scene-0850", "scene-0851", "scene-0852", "scene-0853", "scene-0854", 107 | "scene-0855", "scene-0856", "scene-0858", "scene-0860", "scene-0861", 108 | "scene-0862", "scene-0863", "scene-0864", "scene-0865", "scene-0866", 109 | "scene-0868", "scene-0869", "scene-0870", "scene-0871", "scene-0872", 110 | "scene-0873", "scene-0875", "scene-0876", "scene-0877", "scene-0878", 111 | "scene-0880", "scene-0882", "scene-0883", "scene-0884", "scene-0885", 112 | "scene-0886", "scene-0887", "scene-0888", "scene-0889", 
"scene-0890", 113 | "scene-0891", "scene-0892", "scene-0893", "scene-0894", "scene-0895", 114 | "scene-0896", "scene-0897", "scene-0898", "scene-0899", "scene-0900", 115 | "scene-0901", "scene-0902", "scene-0903", "scene-0904", "scene-0905", 116 | "scene-0906", "scene-0907", "scene-0908", "scene-0909", "scene-0916", 117 | "scene-0917", "scene-0921", "scene-0922", "scene-0923", "scene-0925", 118 | "scene-0926", "scene-0927", "scene-0928", "scene-0929", "scene-0930", 119 | "scene-0931", "scene-0945", "scene-0947", "scene-0949", "scene-0952", 120 | "scene-0953", "scene-0955", "scene-0956", "scene-0957", "scene-0958", 121 | "scene-0959", "scene-0960", "scene-0961", "scene-0966", "scene-0967", 122 | "scene-0968", "scene-0969", "scene-0971", "scene-0972", "scene-0975", 123 | "scene-0976", "scene-0977", "scene-0978", "scene-0979", "scene-0980", 124 | "scene-0981", "scene-0982", "scene-0983", "scene-0984", "scene-0988", 125 | "scene-0989", "scene-0990", "scene-0991", "scene-0992", "scene-0994", 126 | "scene-0995", "scene-0996", "scene-0997", "scene-0998", "scene-0999", 127 | "scene-1000", "scene-1001", "scene-1004", "scene-1005", "scene-1006", 128 | "scene-1007", "scene-1008", "scene-1009", "scene-1010", "scene-1011", 129 | "scene-1012", "scene-1013", "scene-1014", "scene-1015", "scene-1019", 130 | "scene-1020", "scene-1021", "scene-1022", "scene-1023", "scene-1024", 131 | "scene-1025", "scene-1044", "scene-1045", "scene-1046", "scene-1047", 132 | "scene-1048", "scene-1049", "scene-1050", "scene-1051", "scene-1052", 133 | "scene-1053", "scene-1054", "scene-1064", "scene-1065", "scene-1066", 134 | "scene-1067", "scene-1068", "scene-1069", "scene-1070", "scene-1071", 135 | "scene-1072", "scene-1073", "scene-1074", "scene-1075", "scene-1076", 136 | "scene-1077", "scene-1078", "scene-1079", "scene-1080", "scene-1081", 137 | "scene-1082", "scene-1083", "scene-1084", "scene-1085", "scene-1086", 138 | "scene-1087", "scene-1088", "scene-1089", "scene-1090", "scene-1091", 139 | "scene-1092", "scene-1093", "scene-1094", "scene-1095", "scene-1096", 140 | "scene-1097", "scene-1098", "scene-1099", "scene-1100", "scene-1101", 141 | "scene-1102", "scene-1104", "scene-1105", "scene-1106", "scene-1107", 142 | "scene-1108", "scene-1109", "scene-1110"] 143 | 144 | VAL_SCENES = [ 145 | "scene-0001", "scene-0010", "scene-0011", "scene-0020", "scene-0038", 146 | "scene-0041", "scene-0053", "scene-0054", "scene-0121", "scene-0122", 147 | "scene-0139", "scene-0152", "scene-0160", "scene-0184", "scene-0269", 148 | "scene-0347", "scene-0348", "scene-0366", "scene-0368", "scene-0369", 149 | "scene-0378", "scene-0389", "scene-0390", "scene-0391", "scene-0392", 150 | "scene-0393", "scene-0394", "scene-0395", "scene-0396", "scene-0397", 151 | "scene-0398", "scene-0411", "scene-0412", "scene-0413", "scene-0414", 152 | "scene-0415", "scene-0416", "scene-0417", "scene-0418", "scene-0419", 153 | "scene-0525", "scene-0526", "scene-0527", "scene-0528", "scene-0529", 154 | "scene-0530", "scene-0531", "scene-0532", "scene-0533", "scene-0534", 155 | "scene-0535", "scene-0536", "scene-0537", "scene-0538", "scene-0539", 156 | "scene-0541", "scene-0542", "scene-0543", "scene-0544", "scene-0545", 157 | "scene-0546", "scene-0556", "scene-0557", "scene-0558", "scene-0566", 158 | "scene-0568", "scene-0570", "scene-0571", "scene-0572", "scene-0573", 159 | "scene-0574", "scene-0575", "scene-0576", "scene-0577", "scene-0578", 160 | "scene-0580", "scene-0582", "scene-0583", "scene-0642", "scene-0643", 161 | "scene-0644", "scene-0645", 
"scene-0646", "scene-0647", "scene-0648", 162 | "scene-0649", "scene-0650", "scene-0651", "scene-0739", "scene-0740", 163 | "scene-0741", "scene-0744", "scene-0746", "scene-0747", "scene-0749", 164 | "scene-0750", "scene-0751", "scene-0752", "scene-0757", "scene-0758", 165 | "scene-0759", "scene-0760", "scene-0761", "scene-0762", "scene-0763", 166 | "scene-0764", "scene-0765", "scene-0767", "scene-0768", "scene-0769", 167 | "scene-0770", "scene-0771", "scene-0775", "scene-0777", "scene-0778", 168 | "scene-0794", "scene-0795", "scene-0796", "scene-0797", "scene-0798", 169 | "scene-0799", "scene-0800", "scene-0803", "scene-0804", "scene-0911", 170 | "scene-0912", "scene-0913", "scene-0914", "scene-0915", "scene-0919", 171 | "scene-0920", "scene-0924", "scene-0962", "scene-0963", "scene-1002", 172 | "scene-1003", "scene-1016", "scene-1017", "scene-1018", "scene-1055", 173 | "scene-1056", "scene-1057", "scene-1058", "scene-1059", "scene-1060", 174 | "scene-1061", "scene-1062", "scene-1063"] 175 | 176 | 177 | CALIBRATION_SCENES = [ 178 | "scene-0852", "scene-0429", "scene-0956", "scene-0194", "scene-0811", 179 | "scene-1110", "scene-1107", "scene-0294", "scene-0900", "scene-0596", 180 | "scene-0296", "scene-0885", "scene-0866", "scene-0105", "scene-0782", 181 | "scene-0191", "scene-0876", "scene-0133", "scene-0231", "scene-0847", 182 | "scene-0363", "scene-0026", "scene-0791", "scene-0909", "scene-0002", 183 | "scene-0283", "scene-0007", "scene-0251", "scene-1100", "scene-0668", 184 | "scene-0584", "scene-0287", "scene-0260", "scene-0171", "scene-0789", 185 | "scene-0108", "scene-0190", "scene-0206", "scene-0635", "scene-0815", 186 | "scene-0058", "scene-0710", "scene-0302", "scene-0639", "scene-0166", 187 | "scene-0094", "scene-0735", "scene-0321", "scene-1091", "scene-0344" 188 | ] -------------------------------------------------------------------------------- /src/data/nuscenes/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from shapely import geometry, affinity 4 | from pyquaternion import Quaternion 5 | 6 | from nuscenes.eval.detection.utils import category_to_detection_name 7 | from nuscenes.eval.detection.constants import DETECTION_NAMES 8 | from nuscenes.utils.data_classes import LidarPointCloud 9 | 10 | from ..utils import transform_polygon, render_polygon, transform 11 | 12 | CAMERA_NAMES = ['CAM_FRONT', 'CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT', 13 | 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT', 'CAM_BACK'] 14 | 15 | NUSCENES_CLASS_NAMES = [ 16 | 'drivable_area', 'ped_crossing', 'walkway', 'carpark', 'car', 'truck', 17 | 'bus', 'trailer', 'construction_vehicle', 'pedestrian', 'motorcycle', 18 | 'bicycle', 'traffic_cone', 'barrier' 19 | ] 20 | 21 | STATIC_CLASSES = ['drivable_area', 'ped_crossing', 'walkway', 'carpark_area'] 22 | 23 | LOCATIONS = ['boston-seaport', 'singapore-onenorth', 'singapore-queenstown', 24 | 'singapore-hollandvillage'] 25 | 26 | 27 | def iterate_samples(nuscenes, start_token): 28 | sample_token = start_token 29 | while sample_token != '': 30 | sample = nuscenes.get('sample', sample_token) 31 | yield sample 32 | sample_token = sample['next'] 33 | 34 | 35 | def get_map_masks(nuscenes, map_data, sample_data, extents, resolution): 36 | 37 | # Render each layer sequentially 38 | layers = [get_layer_mask(nuscenes, polys, sample_data, extents, 39 | resolution) for layer, polys in map_data.items()] 40 | 41 | return np.stack(layers, axis=0) 42 | 43 | 44 | def get_layer_mask(nuscenes, polygons, 
sample_data, extents, resolution): 45 | 46 | # Get the 2D affine transform from bev coords to map coords 47 | tfm = get_sensor_transform(nuscenes, sample_data)[[0, 1, 3]][:, [0, 2, 3]] 48 | inv_tfm = np.linalg.inv(tfm) 49 | 50 | # Create a patch representing the birds-eye-view region in map coordinates 51 | map_patch = geometry.box(*extents) 52 | map_patch = transform_polygon(map_patch, tfm) 53 | 54 | # Initialise the map mask 55 | x1, z1, x2, z2 = extents 56 | mask = np.zeros((int((z2 - z1) / resolution), int((x2 - x1) / resolution)), 57 | dtype=np.uint8) 58 | 59 | # Find all polygons which intersect with the area of interest 60 | for polygon in polygons.query(map_patch): 61 | 62 | polygon = polygon.intersection(map_patch) 63 | 64 | # Transform into map coordinates 65 | polygon = transform_polygon(polygon, inv_tfm) 66 | 67 | # Render the polygon to the mask 68 | render_shapely_polygon(mask, polygon, extents, resolution) 69 | 70 | return mask.astype(np.bool) 71 | 72 | 73 | 74 | 75 | def get_object_masks(nuscenes, sample_data, extents, resolution): 76 | 77 | # Initialize object masks 78 | nclass = len(DETECTION_NAMES) + 1 79 | grid_width = int((extents[2] - extents[0]) / resolution) 80 | grid_height = int((extents[3] - extents[1]) / resolution) 81 | masks = np.zeros((nclass, grid_height, grid_width), dtype=np.uint8) 82 | 83 | # Get the 2D affine transform from bev coords to map coords 84 | tfm = get_sensor_transform(nuscenes, sample_data)[[0, 1, 3]][:, [0, 2, 3]] 85 | inv_tfm = np.linalg.inv(tfm) 86 | 87 | for box in nuscenes.get_boxes(sample_data['token']): 88 | 89 | # Get the index of the class 90 | det_name = category_to_detection_name(box.name) 91 | if det_name not in DETECTION_NAMES: 92 | class_id = -1 93 | else: 94 | class_id = DETECTION_NAMES.index(det_name) 95 | 96 | # Get bounding box coordinates in the grid coordinate frame 97 | bbox = box.bottom_corners()[:2] 98 | local_bbox = np.dot(inv_tfm[:2, :2], bbox).T + inv_tfm[:2, 2] 99 | 100 | # Render the rotated bounding box to the mask 101 | render_polygon(masks[class_id], local_bbox, extents, resolution) 102 | 103 | return masks.astype(np.bool) 104 | 105 | 106 | def get_sensor_transform(nuscenes, sample_data): 107 | 108 | # Load sensor transform data 109 | sensor = nuscenes.get( 110 | 'calibrated_sensor', sample_data['calibrated_sensor_token']) 111 | sensor_tfm = make_transform_matrix(sensor) 112 | 113 | # Load ego pose data 114 | pose = nuscenes.get('ego_pose', sample_data['ego_pose_token']) 115 | pose_tfm = make_transform_matrix(pose) 116 | 117 | return np.dot(pose_tfm, sensor_tfm) 118 | 119 | 120 | def load_point_cloud(nuscenes, sample_data): 121 | 122 | # Load point cloud 123 | lidar_path = os.path.join(nuscenes.dataroot, sample_data['filename']) 124 | pcl = LidarPointCloud.from_file(lidar_path) 125 | return pcl.points[:3, :].T 126 | 127 | 128 | def make_transform_matrix(record): 129 | """ 130 | Create a 4x4 transform matrix from a calibrated_sensor or ego_pose record 131 | """ 132 | transform = np.eye(4) 133 | transform[:3, :3] = Quaternion(record['rotation']).rotation_matrix 134 | transform[:3, 3] = np.array(record['translation']) 135 | return transform 136 | 137 | 138 | def render_shapely_polygon(mask, polygon, extents, resolution): 139 | 140 | if polygon.geom_type == 'Polygon': 141 | 142 | # Render exteriors 143 | render_polygon(mask, polygon.exterior.coords, extents, resolution, 1) 144 | 145 | # Render interiors 146 | for hole in polygon.interiors: 147 | render_polygon(mask, hole.coords, extents, resolution, 0) 148 | 149 | # 
Handle the case of compound shapes 150 | else: 151 | for poly in polygon: 152 | render_shapely_polygon(mask, poly, extents, resolution) 153 | 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /src/data/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import torch 4 | from shapely import affinity 5 | 6 | def decode_binary_labels(labels, nclass): 7 | bits = torch.pow(2, torch.arange(nclass)) 8 | return (labels & bits.view(-1, 1, 1)) > 0 9 | 10 | 11 | def encode_binary_labels(masks): 12 | bits = np.power(2, np.arange(len(masks), dtype=np.int32)) 13 | return (masks.astype(np.int32) * bits.reshape(-1, 1, 1)).sum(0) 14 | 15 | 16 | def transform(matrix, vectors): 17 | vectors = np.dot(matrix[:-1, :-1], vectors.T) 18 | vectors = vectors.T + matrix[:-1, -1] 19 | return vectors 20 | 21 | 22 | def transform_polygon(polygon, affine): 23 | """ 24 | Transform a 2D polygon 25 | """ 26 | a, b, tx, c, d, ty = affine.flatten()[:6] 27 | return affinity.affine_transform(polygon, [a, b, c, d, tx, ty]) 28 | 29 | 30 | def render_polygon(mask, polygon, extents, resolution, value=1): 31 | if len(polygon) == 0: 32 | return 33 | polygon = (polygon - np.array(extents[:2])) / resolution 34 | polygon = np.ascontiguousarray(polygon).round().astype(np.int32) 35 | cv2.fillConvexPoly(mask, polygon, value) 36 | 37 | 38 | def get_visible_mask(instrinsics, image_width, extents, resolution): 39 | 40 | # Get calibration parameters 41 | fu, cu = instrinsics[0, 0], instrinsics[0, 2] 42 | 43 | # Construct a grid of image coordinates 44 | x1, z1, x2, z2 = extents 45 | x, z = np.arange(x1, x2, resolution), np.arange(z1, z2, resolution) 46 | ucoords = x / z[:, None] * fu + cu 47 | 48 | # Return all points which lie within the camera bounds 49 | return (ucoords >= 0) & (ucoords < image_width) 50 | 51 | 52 | def get_occlusion_mask(points, extents, resolution): 53 | 54 | x1, z1, x2, z2 = extents 55 | 56 | # A 'ray' is defined by the ratio between x and z coordinates 57 | ray_width = resolution / z2 58 | ray_offset = x1 / ray_width 59 | max_rays = int((x2 - x1) / ray_width) 60 | 61 | # Group LiDAR points into bins 62 | rayid = np.round(points[:, 0] / points[:, 2] / ray_width - ray_offset) 63 | depth = points[:, 2] 64 | 65 | # Ignore rays which do not correspond to any grid cells in the BEV 66 | valid = (rayid > 0) & (rayid < max_rays) & (depth > 0) 67 | rayid = rayid[valid] 68 | depth = depth[valid] 69 | 70 | # Find the LiDAR point with maximum depth within each bin 71 | max_depth = np.zeros((max_rays,)) 72 | np.maximum.at(max_depth, rayid.astype(np.int32), depth) 73 | 74 | # For each bev grid point, sample the max depth along the corresponding ray 75 | x = np.arange(x1, x2, resolution) 76 | z = np.arange(z1, z2, resolution)[:, None] 77 | grid_rayid = np.round(x / z / ray_width - ray_offset).astype(np.int32) 78 | grid_max_depth = max_depth[grid_rayid] 79 | 80 | # A grid position is considered occluded if the there are no LiDAR points 81 | # passing through it 82 | occluded = grid_max_depth < z 83 | return occluded 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tom-roddick/mono-semantic-maps/f6b0f52857a4e622ae42bf24faaf9681cc089f76/src/models/__init__.py -------------------------------------------------------------------------------- /src/models/criterion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from ..nn.losses import balanced_binary_cross_entropy, uncertainty_loss, \ 4 | kl_divergence_loss, focal_loss, prior_offset_loss, prior_uncertainty_loss 5 | 6 | # class OccupancyCriterion(nn.Module): 7 | 8 | # def __init__(self, xent_weight=1., uncert_weight=0., class_weights=None): 9 | # super().__init__() 10 | 11 | # self.xent_weight = xent_weight 12 | # self.uncert_weight = uncert_weight 13 | 14 | # if class_weights is None: 15 | # self.class_weights = torch.ones(1) 16 | # else: 17 | # self.class_weights = torch.tensor(class_weights) 18 | 19 | 20 | # def forward(self, logits, labels, mask, *args): 21 | 22 | # # Compute binary cross entropy loss 23 | # self.class_weights = self.class_weights.to(logits) 24 | # bce_loss = balanced_binary_cross_entropy( 25 | # logits, labels, mask, self.class_weights) 26 | 27 | # # Compute uncertainty loss for unknown image regions 28 | # uncert_loss = uncertainty_loss(logits, mask) 29 | 30 | # return bce_loss * self.xent_weight + uncert_loss * self.uncert_weight 31 | 32 | 33 | class OccupancyCriterion(nn.Module): 34 | 35 | def __init__(self, priors, xent_weight=1., uncert_weight=0., 36 | weight_mode='sqrt_inverse'): 37 | super().__init__() 38 | 39 | self.xent_weight = xent_weight 40 | self.uncert_weight = uncert_weight 41 | 42 | self.priors = torch.tensor(priors) 43 | 44 | if weight_mode == 'inverse': 45 | self.class_weights = 1 / self.priors 46 | elif weight_mode == 'sqrt_inverse': 47 | self.class_weights = torch.sqrt(1 / self.priors) 48 | elif weight_mode == 'equal': 49 | self.class_weights = torch.ones_like(self.priors) 50 | else: 51 | raise ValueError('Unknown weight mode option: ' + weight_mode) 52 | 53 | 54 | def forward(self, logits, labels, mask, *args): 55 | 56 | # Compute binary cross entropy loss 57 | self.class_weights = self.class_weights.to(logits) 58 | bce_loss = balanced_binary_cross_entropy( 59 | logits, labels, mask, self.class_weights) 60 | 61 | # Compute uncertainty loss for unknown image regions 62 | self.priors = self.priors.to(logits) 63 | uncert_loss = prior_uncertainty_loss(logits, mask, self.priors) 64 | 65 | return bce_loss * self.xent_weight + uncert_loss * self.uncert_weight 66 | 67 | 68 | 69 | class FocalLossCriterion(nn.Module): 70 | 71 | def __init__(self, alpha, gamma): 72 | super().__init__() 73 | self.alpha = alpha 74 | self.gamma = gamma 75 | 76 | def forward(self, logits, labels, mask, *args): 77 | return focal_loss(logits, labels, mask, self.alpha, self.gamma) 78 | 79 | 80 | class PriorOffsetCriterion(nn.Module): 81 | 82 | def __init__(self, priors): 83 | super().__init__() 84 | self.priors = priors 85 | 86 | def forward(self, logits, labels, mask, *args): 87 | return prior_offset_loss(logits, labels, mask, self.priors) 88 | 89 | 90 | 91 | 92 | class VaeOccupancyCriterion(OccupancyCriterion): 93 | 94 | def __init__(self, priors, xent_weight=0.9, uncert_weight=0., weight_mode='sqrt_inverse', kld_weight=0.1): 95 | super().__init__(priors, xent_weight, uncert_weight, weight_mode) 96 | 97 | self.kld_weight = kld_weight 98 | 99 | def forward(self, logits, labels, mask, mu, logvar): 100 | 101 | kld_loss = kl_divergence_loss(mu, logvar) 102 | occ_loss = 
super().forward(logits, labels, mask) 103 | return occ_loss + kld_loss * self.kld_weight 104 | -------------------------------------------------------------------------------- /src/models/model_factory.py: -------------------------------------------------------------------------------- 1 | import math 2 | from operator import mul 3 | from functools import reduce 4 | import torch.nn as nn 5 | 6 | from .pyramid import PyramidOccupancyNetwork 7 | from .ved import VariationalEncoderDecoder 8 | from .vpn import VPNModel 9 | from .criterion import OccupancyCriterion, VaeOccupancyCriterion, \ 10 | FocalLossCriterion, PriorOffsetCriterion 11 | 12 | from ..nn.fpn import FPN50 13 | from ..nn.topdown import TopdownNetwork 14 | from ..nn.pyramid import TransformerPyramid 15 | from ..nn.classifier import LinearClassifier, BayesianClassifier 16 | 17 | 18 | 19 | def build_model(model_name, config): 20 | 21 | if model_name == 'pyramid': 22 | model = build_pyramid_occupancy_network(config) 23 | elif model_name == 'ved': 24 | model = build_variational_encoder_decoder(config) 25 | elif model_name == 'vpn': 26 | model = build_view_parsing_network(config) 27 | else: 28 | raise ValueError("Unknown model name '{}'".format(model_name)) 29 | 30 | if len(config.gpus) > 1: 31 | model = nn.DataParallel(model.cuda(), config.gpus) 32 | elif len(config.gpus) == 1: 33 | model.cuda() 34 | 35 | return model 36 | 37 | 38 | def build_criterion(model_name, config): 39 | 40 | if model_name == 'ved': 41 | criterion = VaeOccupancyCriterion(config.prior, 42 | config.xent_weight, 43 | config.uncert_weight, 44 | config.weight_mode, 45 | config.kld_weight, 46 | ) 47 | 48 | elif config.loss_fn == 'focal': 49 | criterion = FocalLossCriterion(config.focal.alpha, config.focal.gamma) 50 | elif config.loss_fn == 'prior': 51 | criterion = PriorOffsetCriterion(config.prior) 52 | else: 53 | criterion = OccupancyCriterion(config.prior, config.xent_weight, 54 | config.uncert_weight, config.weight_mode) 55 | 56 | if len(config.gpus) > 0: 57 | criterion.cuda() 58 | 59 | return criterion 60 | 61 | 62 | 63 | def build_pyramid_occupancy_network(config): 64 | 65 | # Build frontend 66 | frontend = FPN50() 67 | 68 | # Build transformer pyramid 69 | tfm_resolution = config.map_resolution * reduce(mul, config.topdown.strides) 70 | transformer = TransformerPyramid(256, config.tfm_channels, tfm_resolution, 71 | config.map_extents, config.ymin, 72 | config.ymax, config.focal_length) 73 | 74 | # Build topdown network 75 | topdown = TopdownNetwork(config.tfm_channels, config.topdown.channels, 76 | config.topdown.layers, config.topdown.strides, 77 | config.topdown.blocktype) 78 | 79 | # Build classifier 80 | if config.bayesian: 81 | classifier = BayesianClassifier(topdown.out_channels, config.num_class) 82 | else: 83 | classifier = LinearClassifier(topdown.out_channels, config.num_class) 84 | classifier.initialise(config.prior) 85 | 86 | # Assemble Pyramid Occupancy Network 87 | return PyramidOccupancyNetwork(frontend, transformer, topdown, classifier) 88 | 89 | 90 | 91 | def build_variational_encoder_decoder(config): 92 | 93 | return VariationalEncoderDecoder(config.num_class, 94 | config.ved.bottleneck_dim, 95 | config.map_extents, 96 | config.map_resolution) 97 | 98 | 99 | def build_view_parsing_network(config): 100 | 101 | return VPNModel(1, config.num_class, config.vpn.output_size, 102 | config.vpn.fc_dim, config.map_extents, 103 | config.map_resolution) 104 | 105 | 106 | -------------------------------------------------------------------------------- 
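A minimal usage sketch of the two factory functions above (not part of the repository): the particular combination of config files, and the use of yacs' merge_from_other_cfg, are assumptions for illustration, and the snippet presumes the merged CfgNode ends up holding every key the builders read (model, prior, map_resolution, topdown.*, etc.).

    from src.utils.configs import get_default_configuration, load_config
    from src.models.model_factory import build_model, build_criterion

    # Start from the defaults and overlay a dataset file and a model file.
    # merge_from_other_cfg is the standard yacs merge; it assumes the default
    # config already declares the keys that the overlay files override.
    config = get_default_configuration()
    config.merge_from_other_cfg(load_config('configs/datasets/nuscenes.yml'))
    config.merge_from_other_cfg(load_config('configs/models/pyramid.yml'))

    # PyramidOccupancyNetwork, wrapped in nn.DataParallel when several GPUs
    # are configured, and an OccupancyCriterion built from the class priors.
    model = build_model('pyramid', config)
    criterion = build_criterion('pyramid', config)
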
/src/models/pyramid.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class PyramidOccupancyNetwork(nn.Module): 8 | 9 | 10 | def __init__(self, frontend, transformer, topdown, classifier): 11 | super().__init__() 12 | 13 | 14 | self.frontend = frontend 15 | self.transformer = transformer 16 | self.topdown = topdown 17 | self.classifier = classifier 18 | 19 | 20 | def forward(self, image, calib, *args): 21 | 22 | # Extract multiscale feature maps 23 | feature_maps = self.frontend(image) 24 | 25 | # Transform image features to birds-eye-view 26 | bev_feats = self.transformer(feature_maps, calib) 27 | 28 | # Apply topdown network 29 | td_feats = self.topdown(bev_feats) 30 | 31 | # Predict individual class log-probabilities 32 | logits = self.classifier(td_feats) 33 | return logits -------------------------------------------------------------------------------- /src/models/ved.py: -------------------------------------------------------------------------------- 1 | """ 2 | This implementation of the model from the paper "Monocular Semantic Occupancy 3 | Grid Mapping with Convolutional Variational Encoder-Decoder Networks" is 4 | directly adapted from the code provided by the original authors at 5 | https://github.com/Chenyang-Lu/mono-semantic-occupancy (accessed 08/06/2020). 6 | 7 | Modifications to the original code are identified in comments. 8 | 9 | MIT License 10 | 11 | Copyright (c) 2019 12 | 13 | Permission is hereby granted, free of charge, to any person obtaining a copy 14 | of this software and associated documentation files (the "Software"), to deal 15 | in the Software without restriction, including without limitation the rights 16 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 17 | copies of the Software, and to permit persons to whom the Software is 18 | furnished to do so, subject to the following conditions: 19 | 20 | The above copyright notice and this permission notice shall be included in all 21 | copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 24 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 25 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 26 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 27 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 28 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 29 | SOFTWARE. 
30 | """ 31 | 32 | import numpy as np 33 | import torch 34 | import torch.nn as nn 35 | import torch.nn.functional as F 36 | import torchvision.models as models 37 | 38 | from ..nn import losses 39 | 40 | 41 | class VariationalEncoderDecoder(nn.Module): 42 | 43 | def __init__(self, num_class, bottleneck_dim, map_extents, map_resolution): 44 | 45 | super().__init__() 46 | self.model = VaeMapping(num_class, bottleneck_dim) 47 | self.output_size = ( 48 | int((map_extents[3] - map_extents[1]) / map_resolution), 49 | int((map_extents[2] - map_extents[0]) / map_resolution), 50 | ) 51 | 52 | 53 | def forward(self, image, *args): 54 | 55 | # Downsample input image so that it more closely matches 56 | # the input dimensions used in the original paper 57 | image = image[:, :, ::2, ::2] 58 | 59 | # Run model forwards 60 | logits, mu, logvar = self.model(image, self.output_size, self.training) 61 | 62 | return logits, mu, logvar 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | def get_upsampling_weight(in_channels, out_channels, kernel_size): 71 | """Make a 2D bilinear kernel suitable for upsampling""" 72 | factor = (kernel_size + 1) // 2 73 | if kernel_size % 2 == 1: 74 | center = factor - 1 75 | else: 76 | center = factor - 0.5 77 | og = np.ogrid[:kernel_size, :kernel_size] 78 | filt = (1 - abs(og[0] - center) / factor) * \ 79 | (1 - abs(og[1] - center) / factor) 80 | weight = np.zeros((in_channels, out_channels, kernel_size, kernel_size), 81 | dtype=np.float64) 82 | weight[range(in_channels), range(out_channels), :, :] = filt 83 | return torch.from_numpy(weight).float() 84 | 85 | 86 | class upsample(nn.Module): 87 | 88 | def __init__(self, if_deconv, channels=None): 89 | super(upsample, self).__init__() 90 | if if_deconv: 91 | self.upsample = nn.ConvTranspose2d( 92 | channels, channels, 4, stride=2, padding=1, bias=False) 93 | else: 94 | self.upsample = nn.Upsample( 95 | scale_factor=2, mode='bilinear', align_corners=True) 96 | 97 | def forward(self, x): 98 | x = self.upsample(x) 99 | 100 | return x 101 | 102 | 103 | class double_conv(nn.Module): 104 | 105 | def __init__(self, in_ch, out_ch): 106 | super(double_conv, self).__init__() 107 | 108 | self.conv = nn.Sequential( 109 | nn.Conv2d(in_ch, out_ch, 3, padding=1), 110 | nn.BatchNorm2d(out_ch), 111 | nn.ReLU(), 112 | nn.Conv2d(out_ch, out_ch, 3, padding=1), 113 | nn.BatchNorm2d(out_ch), 114 | nn.ReLU() 115 | ) 116 | 117 | def forward(self, x): 118 | x = self.conv(x) 119 | return x 120 | 121 | 122 | class encoder_after_vgg(nn.Module): 123 | 124 | def __init__(self, bottleneck_dim=32): 125 | super(encoder_after_vgg, self).__init__() 126 | 127 | self.conv = nn.Sequential( 128 | nn.Conv2d(512, 256, 3, padding=1), 129 | nn.BatchNorm2d(256), 130 | nn.ReLU(), 131 | nn.Conv2d(256, 128, 3, padding=1), 132 | nn.BatchNorm2d(128), 133 | nn.ReLU(), 134 | nn.MaxPool2d(2) 135 | ) 136 | 137 | # MODIFIED: The original VED paper assumed fixed input dimensions of 138 | # 256x512, which leads to a bottleneck dimension of 8x4. Since our 139 | # input size varies depending on dataset we have to specify the 140 | # bottleneck dimension manually. 
141 | self.mu_dec = nn.Linear(bottleneck_dim * 128, 512) 142 | self.logvar_dec = nn.Linear(bottleneck_dim * 128, 512) 143 | 144 | 145 | def forward(self, x): 146 | x = self.conv(x) 147 | x = x.flatten(1, 3) 148 | mu = self.mu_dec(x) 149 | logvar = self.logvar_dec(x) 150 | 151 | return mu, logvar 152 | 153 | 154 | class decoder_conv(nn.Module): 155 | def __init__(self, num_class, if_deconv=True): 156 | super(decoder_conv, self).__init__() 157 | 158 | self.up1 = upsample(if_deconv=if_deconv, channels=128) 159 | self.conv1 = double_conv(128, 256) 160 | self.up2 = upsample(if_deconv=if_deconv, channels=256) 161 | self.conv2 = double_conv(256, 256) 162 | self.up3 = upsample(if_deconv=if_deconv, channels=256) 163 | self.conv3 = double_conv(256, 256) 164 | self.up4 = upsample(if_deconv=if_deconv, channels=256) 165 | self.conv4 = double_conv(256, 256) 166 | self.up5 = upsample(if_deconv=if_deconv, channels=256) 167 | self.conv5 = double_conv(256, 256) 168 | 169 | # MODIFIED: Add an additional upsampling layer 170 | self.up6 = upsample(if_deconv=if_deconv, channels=256) 171 | self.conv6 = double_conv(256, 256) 172 | 173 | self.conv_out = nn.Conv2d(256, num_class, 3, padding=1) 174 | 175 | self._initialize_weights() 176 | 177 | def _initialize_weights(self): 178 | for m in self.modules(): 179 | if isinstance(m, nn.ConvTranspose2d): 180 | assert m.kernel_size[0] == m.kernel_size[1] 181 | initial_weight = get_upsampling_weight( 182 | m.in_channels, m.out_channels, m.kernel_size[0]) 183 | m.weight.data.copy_(initial_weight) 184 | 185 | def forward(self, x, output_size): 186 | x = x.view(-1, 128, 2, 2) 187 | x = self.up1(x) 188 | x = self.conv1(x) 189 | 190 | x = self.up2(x) 191 | x = self.conv2(x) 192 | 193 | x = self.up3(x) 194 | x = self.conv3(x) 195 | 196 | x = self.up4(x) 197 | x = self.conv4(x) 198 | 199 | x = self.up5(x) 200 | x = self.conv5(x) 201 | 202 | # MODIFIED: Add additional upsampling layer 203 | x = self.up6(x) 204 | x = self.conv6(x) 205 | 206 | # MODIFIED: Resample to match label dimensions 207 | x = F.upsample(x, size=output_size, mode='bilinear') 208 | 209 | x = self.conv_out(x) 210 | 211 | return x 212 | 213 | 214 | class VaeMapping(nn.Module): 215 | 216 | def __init__(self, num_class, bottleneck_dim=32): 217 | super(VaeMapping, self).__init__() 218 | 219 | self.vgg16 = models.vgg16_bn(pretrained=True) 220 | self.vgg16_feature = nn.Sequential(*list(self.vgg16.features.children())[:]) 221 | self.encoder_afterv_vgg = encoder_after_vgg(bottleneck_dim) 222 | self.decoder = decoder_conv(num_class, if_deconv=True) 223 | 224 | def reparameterize(self, is_training, mu, logvar): 225 | if is_training: 226 | std = torch.exp(0.5*logvar) 227 | eps = torch.randn_like(std) 228 | return eps.mul(std).add_(mu) 229 | else: 230 | return mu 231 | 232 | def forward(self, x, output_size, is_training=False, defined_mu=None): 233 | 234 | x = self.vgg16_feature(x) 235 | mu, logvar = self.encoder_afterv_vgg(x) 236 | z = self.reparameterize(is_training, mu, logvar) 237 | if defined_mu is not None: 238 | z = defined_mu 239 | pred_map = self.decoder(z, output_size) 240 | 241 | return pred_map, mu, logvar 242 | 243 | 244 | def loss_function_map(pred_map, map, mu, logvar): 245 | 246 | # MODIFIED: move weights to same GPU as inputs 247 | CE = F.cross_entropy(pred_map, map.view(-1, 64, 64), weight= 248 | torch.Tensor([0.6225708, 2.53963754, 15.46416047, 0.52885405]).to(map), ignore_index=4) 249 | KLD = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp()) 250 | 251 | return 0.9*CE + 0.1*KLD, CE, KLD 252 | 253 | 
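The MODIFIED comment in encoder_after_vgg above explains why the bottleneck size must be supplied manually: the image is halved in VariationalEncoderDecoder.forward, reduced by a further factor of 32 by the VGG16-bn feature extractor, and halved once more by the MaxPool2d(2) in encoder_after_vgg, so bottleneck_dim is just the number of spatial cells that survive an overall downsampling of 128. A sketch of the arithmetic for a hypothetical 600 x 960 input (the numbers below are illustrative, not taken from any particular config):

    # 600 x 960 image   --[::2 slice in forward]-->   300 x 480
    # 300 x 480         --[VGG16-bn features, /32]-->   9 x  15
    #   9 x  15         --[extra MaxPool2d(2)]-->        4 x   7
    # bottleneck_dim = 4 * 7 = 28, so mu_dec and logvar_dec become
    # nn.Linear(28 * 128, 512) layers acting on the 3584-dim flattened feature.
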
-------------------------------------------------------------------------------- /src/models/vpn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This implementation of the model from the paper "Cross-view Semantic 3 | Segmentation for Sensing Surroundings" is directly adapted from the code 4 | provided by the original authors at 5 | https://github.com/pbw-Berwin/View-Parsing-Network (accessed 08/06/2020) 6 | 7 | """ 8 | 9 | 10 | # File : models.py 11 | # Author : Bowen Pan 12 | # Email : panbowen0607@gmail.com 13 | # Date : 09/18/2018 14 | # 15 | # Distributed under terms of the MIT license. 16 | import os 17 | import sys 18 | import math 19 | import torch 20 | from torch import nn 21 | from collections import OrderedDict 22 | import torch.nn.functional as F 23 | 24 | import numpy as np 25 | from itertools import combinations 26 | 27 | from torchvision.models import resnet 28 | 29 | try: 30 | from urllib import urlretrieve 31 | except ImportError: 32 | from urllib.request import urlretrieve 33 | 34 | 35 | class TransformModule(nn.Module): 36 | def __init__(self, dim=(37, 60), num_view=8): 37 | super(TransformModule, self).__init__() 38 | self.num_view = num_view 39 | self.dim = dim 40 | self.mat_list = nn.ModuleList() 41 | 42 | # MODIFIED: dims need not be square 43 | for i in range(self.num_view): 44 | fc_transform = nn.Sequential( 45 | nn.Linear(dim[0] * dim[1], dim[0] * dim[1]), 46 | nn.ReLU(), 47 | nn.Linear(dim[0] * dim[1], dim[0] * dim[1]), 48 | nn.ReLU() 49 | ) 50 | self.mat_list += [fc_transform] 51 | 52 | def forward(self, x): 53 | # shape x: B, V, C, H, W 54 | x = x.view(list(x.size()[:3]) + [self.dim[0] * self.dim[1],]) 55 | view_comb = self.mat_list[0](x[:, 0]) 56 | for index in range(x.size(1))[1:]: 57 | view_comb += self.mat_list[index](x[:, index]) 58 | view_comb = view_comb.view(list(view_comb.size()[:2]) + list(self.dim)) 59 | return view_comb 60 | 61 | 62 | class SumModule(nn.Module): 63 | def __init__(self): 64 | super(SumModule, self).__init__() 65 | 66 | def forward(self, x): 67 | # shape x: B, V, C, H, W 68 | x = torch.sum(x, dim=1, keepdim=False) 69 | return x 70 | 71 | 72 | class VPNModel(nn.Module): 73 | def __init__(self, num_views, num_class, output_size, fc_dim, map_extents, 74 | map_resolution): 75 | 76 | super(VPNModel, self).__init__() 77 | self.num_views = num_views 78 | self.output_size = output_size 79 | 80 | self.seg_size = ( 81 | int((map_extents[3] - map_extents[1]) / map_resolution), 82 | int((map_extents[2] - map_extents[0]) / map_resolution), 83 | ) 84 | 85 | # MODIFIED: we fix the transform module, the encoder and decoder to be 86 | # the ones described in the paper 87 | self.encoder = resnet18(True) 88 | self.transform_module = TransformModule(dim=self.output_size, 89 | num_view=self.num_views) 90 | self.decoder = PPMBilinear(num_class, fc_dim, False) 91 | 92 | def forward(self, x, *args): 93 | B, N, C, H, W = x.view([-1, self.num_views, int(x.size()[1] / self.num_views)] \ 94 | + list(x.size()[2:])).size() 95 | 96 | x = x.view(B*N, C, H, W) 97 | x = self.encoder(x)[0] 98 | x = x.view([B, N] + list(x.size()[1:])) 99 | x = self.transform_module(x) 100 | x = self.decoder([x], self.seg_size) 101 | 102 | return x 103 | 104 | 105 | 106 | def conv3x3(in_planes, out_planes, stride=1): 107 | "3x3 convolution with padding" 108 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 109 | padding=1, bias=False) 110 | 111 | class ResNet(nn.Module): 112 | 113 | def __init__(self, block, layers): 
114 | self.inplanes = 128 115 | super(ResNet, self).__init__() 116 | self.conv1 = conv3x3(3, 64, stride=2) 117 | self.bn1 = nn.BatchNorm2d(64) 118 | self.relu1 = nn.ReLU(inplace=True) 119 | self.conv2 = conv3x3(64, 64) 120 | self.bn2 = nn.BatchNorm2d(64) 121 | self.relu2 = nn.ReLU(inplace=True) 122 | self.conv3 = conv3x3(64, 128) 123 | self.bn3 = nn.BatchNorm2d(128) 124 | self.relu3 = nn.ReLU(inplace=True) 125 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 126 | 127 | self.layer1 = self._make_layer(block, 64, layers[0]) 128 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 129 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 130 | # self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 131 | 132 | 133 | for m in self.modules(): 134 | if isinstance(m, nn.Conv2d): 135 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 136 | m.weight.data.normal_(0, math.sqrt(2. / n)) 137 | elif isinstance(m, nn.BatchNorm2d): 138 | m.weight.data.fill_(1) 139 | m.bias.data.zero_() 140 | 141 | def _make_layer(self, block, planes, blocks, stride=1): 142 | downsample = None 143 | if stride != 1 or self.inplanes != planes * block.expansion: 144 | downsample = nn.Sequential( 145 | nn.Conv2d(self.inplanes, planes * block.expansion, 146 | kernel_size=1, stride=stride, bias=False), 147 | nn.BatchNorm2d(planes * block.expansion), 148 | ) 149 | 150 | layers = [] 151 | layers.append(block(self.inplanes, planes, stride, downsample)) 152 | self.inplanes = planes * block.expansion 153 | for _ in range(1, blocks): 154 | layers.append(block(self.inplanes, planes)) 155 | 156 | return nn.Sequential(*layers) 157 | 158 | def forward(self, x, return_feature_maps=False): 159 | x = self.relu1(self.bn1(self.conv1(x))) 160 | x = self.relu2(self.bn2(self.conv2(x))) 161 | x = self.relu3(self.bn3(self.conv3(x))) 162 | x = self.maxpool(x) 163 | 164 | conv_out = [] 165 | x = self.layer1(x); conv_out.append(x) 166 | x = self.layer2(x); conv_out.append(x) 167 | x = self.layer3(x); conv_out.append(x) 168 | # x = self.layer4(x) 169 | 170 | if return_feature_maps: 171 | return conv_out 172 | return [x] 173 | 174 | 175 | def resnet18(pretrained=False, **kwargs): 176 | model = ResNet(resnet.BasicBlock, [2, 2, 2, 2], **kwargs) 177 | if pretrained: 178 | weights = load_url('http://sceneparsing.csail.mit.edu/model/'\ 179 | 'pretrained_resnet/resnet18-imagenet.pth') 180 | state_dict = model.state_dict() 181 | for key, weight in state_dict.items(): 182 | weight.copy_(weights[key]) 183 | model.load_state_dict(state_dict) 184 | return model 185 | 186 | 187 | 188 | def load_url(url, model_dir='./pretrained', map_location=None): 189 | if not os.path.exists(model_dir): 190 | os.makedirs(model_dir) 191 | filename = url.split('/')[-1] 192 | cached_file = os.path.join(model_dir, filename) 193 | if not os.path.exists(cached_file): 194 | sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) 195 | urlretrieve(url, cached_file) 196 | return torch.load(cached_file, map_location=map_location) 197 | 198 | 199 | 200 | # pyramid pooling, bilinear upsample 201 | class PPMBilinear(nn.Module): 202 | def __init__(self, num_class=150, fc_dim=4096, 203 | use_softmax=False, pool_scales=(1, 2, 3, 6)): 204 | super(PPMBilinear, self).__init__() 205 | self.use_softmax = use_softmax 206 | 207 | self.ppm = [] 208 | for scale in pool_scales: 209 | self.ppm.append(nn.Sequential( 210 | nn.AdaptiveAvgPool2d(scale), 211 | nn.Conv2d(fc_dim, 512, kernel_size=1, bias=False), 212 | nn.BatchNorm2d(512), 
213 | nn.ReLU(inplace=True) 214 | )) 215 | self.ppm = nn.ModuleList(self.ppm) 216 | 217 | self.conv_last = nn.Sequential( 218 | nn.Conv2d(fc_dim+len(pool_scales)*512, 512, 219 | kernel_size=3, padding=1, bias=False), 220 | nn.BatchNorm2d(512), 221 | nn.ReLU(inplace=True), 222 | nn.Dropout2d(0.1), 223 | nn.Conv2d(512, num_class, kernel_size=1) 224 | ) 225 | 226 | def forward(self, conv_out, segSize=None, return_feat=False): 227 | conv5 = conv_out[-1] 228 | 229 | input_size = conv5.size() 230 | ppm_out = [conv5] 231 | for pool_scale in self.ppm: 232 | ppm_out.append(nn.functional.upsample( 233 | pool_scale(conv5), 234 | (input_size[2], input_size[3]), 235 | mode='bilinear', align_corners=False)) 236 | ppm_out = torch.cat(ppm_out, 1) 237 | 238 | x = self.conv_last(ppm_out) 239 | feat = x 240 | if segSize is not None: 241 | x = nn.functional.upsample( 242 | x, size=segSize, mode='bilinear', align_corners=False) 243 | if self.use_softmax: # is True during inference 244 | x = nn.functional.softmax(x, dim=1) 245 | 246 | # MODIFIED: we use BCE loss, so do not convert to log-softmax 247 | if return_feat: 248 | return x, feat 249 | return x -------------------------------------------------------------------------------- /src/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tom-roddick/mono-semantic-maps/f6b0f52857a4e622ae42bf24faaf9681cc089f76/src/nn/__init__.py -------------------------------------------------------------------------------- /src/nn/classifier.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | class LinearClassifier(nn.Conv2d): 7 | 8 | def __init__(self, in_channels, num_class): 9 | super().__init__(in_channels, num_class, 1) 10 | 11 | def initialise(self, prior): 12 | prior = torch.tensor(prior) 13 | self.weight.data.zero_() 14 | self.bias.data.copy_(torch.log(prior / (1 - prior))) 15 | 16 | 17 | 18 | class BayesianClassifier(nn.Module): 19 | 20 | def __init__(self, in_channels, num_class, num_samples=40): 21 | super().__init__() 22 | self.conv = nn.Conv2d(in_channels, num_class, 1) 23 | self.num_samples = num_samples 24 | 25 | def initialise(self, prior): 26 | prior = torch.tensor(prior) 27 | self.conv.weight.data.zero_() 28 | self.conv.bias.data.copy_(torch.log(prior / (1 - prior))) 29 | 30 | def forward(self, features): 31 | 32 | if self.training: 33 | # At training time, apply dropout once 34 | features = F.dropout2d(features, 0.5, training=True) 35 | logits = self.conv(features) 36 | 37 | else: 38 | # At test time, apply dropout multiple times and average the result 39 | mean_score = 0 40 | for _ in range(self.num_samples): 41 | drop_feats = F.dropout2d(features, 0.5, training=True) 42 | mean_score += F.sigmoid(self.conv(drop_feats)) 43 | mean_score = mean_score / self.num_samples 44 | 45 | # Convert back into logits format 46 | logits = torch.log(mean_score) - torch.log1p(-mean_score) 47 | 48 | return logits 49 | 50 | 51 | -------------------------------------------------------------------------------- /src/nn/fpn.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Adapted from the implementation of 3 | https://github.com/kuangliu/pytorch-retinanet/ 4 | ''' 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | from torchvision.models.utils import load_state_dict_from_url 10 | 11 | 
from .resnet import ResNetLayer 12 | 13 | 14 | class FPN(nn.Module): 15 | def __init__(self, num_blocks): 16 | super(FPN, self).__init__() 17 | self.in_planes = 64 18 | 19 | self.conv1 = nn.Conv2d( 20 | 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 21 | self.bn1 = nn.BatchNorm2d(64) 22 | 23 | # Bottom-up layers 24 | self.layer1 = ResNetLayer(64, 64, num_blocks[0], stride=1) 25 | self.layer2 = ResNetLayer(256, 128, num_blocks[1], stride=2) 26 | self.layer3 = ResNetLayer(512, 256, num_blocks[2], stride=2) 27 | self.layer4 = ResNetLayer(1024, 512, num_blocks[3], stride=2) 28 | self.conv6 = nn.Conv2d(2048, 256, kernel_size=3, stride=2, padding=1) 29 | self.conv7 = nn.Conv2d( 256, 256, kernel_size=3, stride=2, padding=1) 30 | 31 | # Lateral layers 32 | self.latlayer1 = nn.Conv2d(2048, 256, kernel_size=1, stride=1, padding=0) 33 | self.latlayer2 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0) 34 | self.latlayer3 = nn.Conv2d( 512, 256, kernel_size=1, stride=1, padding=0) 35 | 36 | # Top-down layers 37 | self.toplayer1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1) 38 | self.toplayer2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1) 39 | 40 | # ImageNet normalization parameters 41 | self.register_buffer('mean', torch.tensor([0.485, 0.456, 0.406])) 42 | self.register_buffer('std', torch.tensor([0.229, 0.224, 0.225])) 43 | 44 | 45 | def load_pretrained(self, path): 46 | pretrained = load_state_dict_from_url(path, progress=True) 47 | state_dict = self.state_dict() 48 | for key, weights in pretrained.items(): 49 | if key in state_dict: 50 | state_dict[key].copy_(weights) 51 | 52 | self.load_state_dict(state_dict) 53 | 54 | 55 | def _upsample_add(self, x, y): 56 | '''Upsample and add two feature maps. 57 | 58 | Args: 59 | x: (Variable) top feature map to be upsampled. 60 | y: (Variable) lateral feature map. 61 | 62 | Returns: 63 | (Variable) added feature map. 64 | 65 | Note in PyTorch, when input size is odd, the upsampled feature map 66 | with `F.upsample(..., scale_factor=2, mode='nearest')` 67 | maybe not equal to the lateral feature map size. 68 | 69 | e.g. 70 | original input size: [N,_,15,15] -> 71 | conv2d feature map size: [N,_,8,8] -> 72 | upsampled feature map size: [N,_,16,16] 73 | 74 | So we choose bilinear upsample which supports arbitrary output sizes. 
75 | ''' 76 | _,_,H,W = y.size() 77 | return F.upsample(x, size=(H,W), mode='bilinear') + y 78 | 79 | def forward(self, x): 80 | 81 | # Normalize image 82 | x = (x - self.mean.view(-1, 1, 1)) / self.std.view(-1, 1, 1) 83 | 84 | # Bottom-up 85 | c1 = F.relu(self.bn1(self.conv1(x))) 86 | c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1) 87 | c2 = self.layer1(c1) 88 | c3 = self.layer2(c2) 89 | c4 = self.layer3(c3) 90 | c5 = self.layer4(c4) 91 | p6 = self.conv6(c5) 92 | p7 = self.conv7(F.relu(p6)) 93 | # Top-down 94 | p5 = self.latlayer1(c5) 95 | p4 = self._upsample_add(p5, self.latlayer2(c4)) 96 | p4 = self.toplayer1(p4) 97 | p3 = self._upsample_add(p4, self.latlayer3(c3)) 98 | p3 = self.toplayer2(p3) 99 | return p3, p4, p5, p6, p7 100 | 101 | 102 | def FPN50(): 103 | fpn = FPN([3,4,6,3]) 104 | fpn.load_pretrained( 105 | 'https://download.pytorch.org/models/resnet50-19c8e357.pth') 106 | return fpn 107 | 108 | def FPN101(): 109 | fpn = FPN([2,4,23,3]) 110 | fpn.load_pretrained( 111 | 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth') 112 | return fpn 113 | -------------------------------------------------------------------------------- /src/nn/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | INV_LOG2 = 0.693147 5 | 6 | 7 | def balanced_binary_cross_entropy(logits, labels, mask, weights): 8 | weights = (logits.new(weights).view(-1, 1, 1) - 1) * labels.float() + 1. 9 | weights = weights * mask.unsqueeze(1).float() 10 | return F.binary_cross_entropy_with_logits(logits, labels.float(), weights) 11 | 12 | 13 | def uncertainty_loss(x, mask): 14 | """ 15 | Loss which maximizes the uncertainty in invalid regions of the image 16 | """ 17 | labels = ~mask 18 | x = x[labels.unsqueeze(1).expand_as(x)] 19 | xp, xm = x, -x 20 | entropy = xp.sigmoid() * F.logsigmoid(xp) + xm.sigmoid() * F.logsigmoid(xm) 21 | return 1. 
+ entropy.mean() / INV_LOG2 22 | 23 | 24 | def prior_uncertainty_loss(x, mask, priors): 25 | priors = x.new(priors).view(1, -1, 1, 1).expand_as(x) 26 | xent = F.binary_cross_entropy_with_logits(x, priors, reduce=False) 27 | return (xent * (~mask).float().unsqueeze(1)).mean() 28 | 29 | 30 | def kl_divergence_loss(mu, logvar): 31 | return -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp()) 32 | 33 | 34 | def focal_loss(logits, labels, mask, alpha=0.5, gamma=2): 35 | 36 | bce_loss = F.binary_cross_entropy_with_logits(logits, labels.float(), 37 | reduce=False) 38 | pt = torch.exp(-bce_loss) 39 | at = pt.new([alpha, 1 - alpha])[labels.long()] 40 | focal_loss = at * (1 - pt) ** gamma * bce_loss 41 | 42 | return (focal_loss * mask.unsqueeze(1).float()).mean() 43 | 44 | 45 | def prior_offset_loss(logits, labels, mask, priors): 46 | 47 | priors = logits.new(priors).view(-1, 1, 1) 48 | prior_logits = torch.log(priors / (1 - priors)) 49 | labels = labels.float() 50 | 51 | weights = .5 / priors * labels + .5 / (1 - priors) * (1 - labels) 52 | weights = weights * mask.unsqueeze(1).float() 53 | return F.binary_cross_entropy_with_logits(logits - prior_logits, labels, 54 | weights) 55 | -------------------------------------------------------------------------------- /src/nn/pyramid.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | 5 | from .transformer import DenseTransformer 6 | 7 | class TransformerPyramid(nn.Module): 8 | 9 | def __init__(self, in_channels, channels, resolution, extents, ymin, ymax, 10 | focal_length): 11 | 12 | super().__init__() 13 | self.transformers = nn.ModuleList() 14 | for i in range(5): 15 | 16 | # Scaled focal length for each transformer 17 | focal = focal_length / pow(2, i + 3) 18 | 19 | # Compute grid bounds for each transformer 20 | zmax = min(math.floor(focal * 2) * resolution, extents[3]) 21 | zmin = math.floor(focal) * resolution if i < 4 else extents[1] 22 | subset_extents = [extents[0], zmin, extents[2], zmax] 23 | # Build transformers 24 | tfm = DenseTransformer(in_channels, channels, resolution, 25 | subset_extents, ymin, ymax, focal) 26 | self.transformers.append(tfm) 27 | 28 | 29 | def forward(self, feature_maps, calib): 30 | 31 | bev_feats = list() 32 | for i, fmap in enumerate(feature_maps): 33 | 34 | # Scale calibration matrix to account for downsampling 35 | scale = 8 * 2 ** i 36 | calib_downsamp = calib.clone() 37 | calib_downsamp[:, :2] = calib[:, :2] / scale 38 | 39 | # Apply orthographic transformation to each feature map separately 40 | bev_feats.append(self.transformers[i](fmap, calib_downsamp)) 41 | 42 | # Combine birds-eye-view feature maps along the depth axis 43 | return torch.cat(bev_feats[::-1], dim=-2) 44 | -------------------------------------------------------------------------------- /src/nn/resampler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .. 
import utils 6 | 7 | class Resampler(nn.Module): 8 | 9 | def __init__(self, resolution, extents): 10 | super().__init__() 11 | 12 | # Store z positions of the near and far planes 13 | self.near = extents[1] 14 | self.far = extents[3] 15 | 16 | # Make a grid in the x-z plane 17 | self.grid = _make_grid(resolution, extents) 18 | 19 | 20 | def forward(self, features, calib): 21 | 22 | # Copy grid to the correct device 23 | self.grid = self.grid.to(features) 24 | 25 | # We ignore the image v-coordinate, and assume the world Y-coordinate 26 | # is zero, so we only need a 2x2 submatrix of the original 3x3 matrix 27 | calib = calib[:, [0, 2]][..., [0, 2]].view(-1, 1, 1, 2, 2) 28 | 29 | # Transform grid center locations into image u-coordinates 30 | cam_coords = torch.matmul(calib, self.grid.unsqueeze(-1)).squeeze(-1) 31 | 32 | # Apply perspective projection and normalize 33 | ucoords = cam_coords[..., 0] / cam_coords[..., 1] 34 | ucoords = ucoords / features.size(-1) * 2 - 1 35 | 36 | # Normalize z coordinates 37 | zcoords = (cam_coords[..., 1]-self.near) / (self.far-self.near) * 2 - 1 38 | 39 | # Resample 3D feature map 40 | grid_coords = torch.stack([ucoords, zcoords], -1).clamp(-1.1, 1.1) 41 | return F.grid_sample(features, grid_coords) 42 | 43 | 44 | def _make_grid(resolution, extents): 45 | # Create a grid of cooridinates in the birds-eye-view 46 | x1, z1, x2, z2 = extents 47 | zz, xx = torch.meshgrid( 48 | torch.arange(z1, z2, resolution), torch.arange(x1, x2, resolution)) 49 | 50 | return torch.stack([xx, zz], dim=-1) -------------------------------------------------------------------------------- /src/nn/resnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | def conv3x3(in_planes, out_planes, stride=1, dilation=1): 6 | """3x3 convolution with padding""" 7 | 8 | # Fractional strides correspond to transpose convolution 9 | if stride < 1: 10 | stride = int(round(1 / stride)) 11 | kernel_size = stride + 2 12 | padding = int((dilation * (kernel_size - 1) - stride + 1) / 2) 13 | return nn.ConvTranspose2d( 14 | in_planes, out_planes, kernel_size, stride, padding, 15 | output_padding=0, dilation=dilation, bias=False) 16 | 17 | # Otherwise return normal convolution 18 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=int(stride), 19 | dilation=dilation, padding=dilation, bias=False) 20 | 21 | 22 | def conv1x1(in_planes, out_planes, stride=1): 23 | """1x1 convolution""" 24 | 25 | # Fractional strides correspond to transpose convolution 26 | if int(1 / stride) > 1: 27 | stride = int(1 / stride) 28 | return nn.ConvTranspose2d( 29 | in_planes, out_planes, kernel_size=stride, stride=stride,bias=False) 30 | 31 | return nn.Conv2d( 32 | in_planes, out_planes, kernel_size=1, stride=int(stride), bias=False) 33 | 34 | 35 | class BasicBlock(nn.Module): 36 | expansion = 1 37 | 38 | def __init__(self, inplanes, planes, stride=1, dilation=1): 39 | super(BasicBlock, self).__init__() 40 | 41 | self.conv1 = conv3x3(inplanes, planes, stride, dilation) 42 | self.bn1 = nn.GroupNorm(16, planes) 43 | 44 | self.conv2 = conv3x3(planes, planes, 1, dilation) 45 | self.bn2 = nn.GroupNorm(16, planes) 46 | 47 | if stride != 1 or inplanes != planes: 48 | self.downsample = nn.Sequential( 49 | conv1x1(inplanes, planes, stride), nn.GroupNorm(16, planes)) 50 | else: 51 | self.downsample = None 52 | 53 | 54 | def forward(self, x): 55 | identity = x 56 | 57 | out = F.relu(self.bn1(self.conv1(x)), 
inplace=True) 58 | out = self.bn2(self.conv2(out)) 59 | 60 | if self.downsample is not None: 61 | identity = self.downsample(x) 62 | 63 | out += identity 64 | out = F.relu(out, inplace=True) 65 | 66 | return out 67 | 68 | 69 | class Bottleneck(nn.Module): 70 | expansion = 4 71 | 72 | def __init__(self, inplanes, planes, stride=1, dilation=1): 73 | super(Bottleneck, self).__init__() 74 | self.conv1 = conv1x1(inplanes, planes) 75 | self.bn1 = nn.GroupNorm(16, planes) 76 | self.conv2 = conv3x3(planes, planes, stride, dilation) 77 | self.bn2 = nn.GroupNorm(16, planes) 78 | self.conv3 = conv1x1(planes, planes * self.expansion) 79 | self.bn3 = nn.GroupNorm(16, planes * self.expansion) 80 | 81 | if stride != 1 or inplanes != planes * self.expansion: 82 | self.downsample = nn.Sequential( 83 | conv1x1(inplanes, planes * self.expansion, stride), 84 | nn.GroupNorm(16, planes * self.expansion)) 85 | else: 86 | self.downsample = None 87 | 88 | def forward(self, x): 89 | identity = x 90 | 91 | out = F.relu(self.bn1(self.conv1(x)), inplace=True) 92 | out = F.relu(self.bn2(self.conv2(out)), inplace=True) 93 | out = self.bn3(self.conv3(out)) 94 | 95 | if self.downsample is not None: 96 | identity = self.downsample(x) 97 | 98 | out += identity 99 | out = F.relu(out) 100 | 101 | return out 102 | 103 | 104 | class ResNetLayer(nn.Sequential): 105 | 106 | def __init__(self, in_channels, channels, num_blocks, stride=1, 107 | dilation=1, blocktype='bottleneck'): 108 | 109 | # Get block type 110 | if blocktype == 'basic': 111 | block = BasicBlock 112 | elif blocktype == 'bottleneck': 113 | block = Bottleneck 114 | else: 115 | raise Exception("Unknown residual block type: " + str(blocktype)) 116 | 117 | # Construct layers 118 | layers = [block(in_channels, channels, stride, dilation)] 119 | for _ in range(1, num_blocks): 120 | layers.append(block(channels * block.expansion, channels, 1, dilation)) 121 | 122 | self.in_channels = in_channels 123 | self.out_channels = channels * block.expansion 124 | 125 | super(ResNetLayer, self).__init__(*layers) -------------------------------------------------------------------------------- /src/nn/topdown.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .resnet import ResNetLayer 6 | 7 | class TopdownNetwork(nn.Sequential): 8 | 9 | def __init__(self, in_channels, channels, layers=[6, 1, 1], 10 | strides=[1, 2, 2], blocktype='basic'): 11 | 12 | modules = list() 13 | self.downsample = 1 14 | for nblocks, stride in zip(layers, strides): 15 | 16 | # Add a new residual layer 17 | module = ResNetLayer( 18 | in_channels, channels, nblocks, 1/stride, blocktype=blocktype) 19 | modules.append(module) 20 | 21 | # Halve the number of channels at each layer 22 | in_channels = module.out_channels 23 | channels = channels // 2 24 | self.downsample *= stride 25 | 26 | self.out_channels = in_channels 27 | 28 | 29 | super().__init__(*modules) -------------------------------------------------------------------------------- /src/nn/transformer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from .resampler import Resampler 7 | 8 | class DenseTransformer(nn.Module): 9 | 10 | def __init__(self, in_channels, channels, resolution, grid_extents, 11 | ymin, ymax, focal_length, groups=1): 12 | super().__init__() 13 | 14 | # Initial convolution 
to reduce feature dimensions 15 | self.conv = nn.Conv2d(in_channels, channels, 1) 16 | self.bn = nn.GroupNorm(16, channels) 17 | 18 | # Resampler transforms perspective features to BEV 19 | self.resampler = Resampler(resolution, grid_extents) 20 | 21 | # Compute input height based on region of image covered by grid 22 | self.zmin, zmax = grid_extents[1], grid_extents[3] 23 | self.in_height = math.ceil(focal_length * (ymax - ymin) / self.zmin) 24 | self.ymid = (ymin + ymax) / 2 25 | 26 | # Compute number of output cells required 27 | self.out_depth = math.ceil((zmax - self.zmin) / resolution) 28 | 29 | # Dense layer which maps UV features to UZ 30 | self.fc = nn.Conv1d( 31 | channels * self.in_height, channels * self.out_depth, 1, groups=groups 32 | ) 33 | self.out_channels = channels 34 | 35 | 36 | def forward(self, features, calib, *args): 37 | 38 | # Crop feature maps to a fixed input height 39 | features = torch.stack([self._crop_feature_map(fmap, cal) 40 | for fmap, cal in zip(features, calib)]) 41 | 42 | # Reduce feature dimension to minimize memory usage 43 | features = F.relu(self.bn(self.conv(features))) 44 | 45 | # Flatten height and channel dimensions 46 | B, C, _, W = features.shape 47 | flat_feats = features.flatten(1, 2) 48 | bev_feats = self.fc(flat_feats).view(B, C, -1, W) 49 | 50 | # Resample to orthographic grid 51 | return self.resampler(bev_feats, calib) 52 | 53 | 54 | def _crop_feature_map(self, fmap, calib): 55 | 56 | # Compute upper and lower bounds of visible region 57 | focal_length, img_offset = calib[1, 1:] 58 | vmid = self.ymid * focal_length / self.zmin + img_offset 59 | vmin = math.floor(vmid - self.in_height / 2) 60 | vmax = math.floor(vmid + self.in_height / 2) 61 | 62 | # Pad or crop input tensor to match dimensions 63 | return F.pad(fmap, [0, 0, -vmin, vmax - fmap.shape[-2]]) -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .geometry import * -------------------------------------------------------------------------------- /src/utils/configs.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from yacs.config import CfgNode 4 | try: 5 | from dotenv import load_dotenv 6 | load_dotenv() 7 | except: 8 | pass 9 | 10 | ROOT = os.path.abspath(os.path.join(__file__, '..', '..', '..')) 11 | 12 | 13 | def load_config(config_path): 14 | with open(config_path) as f: 15 | return CfgNode.load_cfg(f) 16 | 17 | def get_default_configuration(): 18 | defaults_path = os.path.join(ROOT, 'configs/defaults.yml') 19 | return load_config(defaults_path) 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/utils/confusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class BinaryConfusionMatrix(object): 4 | 5 | def __init__(self, num_class): 6 | self.tp = torch.zeros(num_class, dtype=torch.long) 7 | self.fp = torch.zeros(num_class, dtype=torch.long) 8 | self.fn = torch.zeros(num_class, dtype=torch.long) 9 | self.tn = torch.zeros(num_class, dtype=torch.long) 10 | 11 | 12 | @property 13 | def num_class(self): 14 | return len(self.tp) 15 | 16 | def update(self, preds, labels, mask=None): 17 | 18 | preds = preds.detach().cpu() 19 | labels = labels.detach().cpu() 20 | 21 | # Move batch dimension to the end 22 | preds = preds.flatten(2, -1).permute(1, 0, 2).reshape( 23 | 
self.num_class, -1) 24 | labels = labels.flatten(2, -1).permute(1, 0, 2).reshape( 25 | self.num_class, -1) 26 | 27 | if mask is not None: 28 | preds = preds[:, mask.flatten()] 29 | labels = labels[:, mask.flatten()] 30 | 31 | 32 | true_pos = preds & labels 33 | false_pos = preds & ~labels 34 | false_neg = ~preds & labels 35 | true_neg = ~preds & ~labels 36 | 37 | # Update global counts 38 | self.tp += true_pos.long().sum(-1) 39 | self.fp += false_pos.long().sum(-1) 40 | self.fn += false_neg.long().sum(-1) 41 | self.tn += true_neg.long().sum(-1) 42 | 43 | 44 | @property 45 | def iou(self): 46 | return self.tp.float() / (self.tp + self.fn + self.fp).float() 47 | 48 | @property 49 | def mean_iou(self): 50 | # Only compute mean over classes with at least one ground truth 51 | valid = (self.tp + self.fn) > 0 52 | if not valid.any(): 53 | return 0 54 | return float(self.iou[valid].mean()) 55 | 56 | @property 57 | def dice(self): 58 | return 2 * self.tp.float() / (2 * self.tp + self.fp + self.fn).float() 59 | 60 | @property 61 | def macro_dice(self): 62 | valid = (self.tp + self.fn) > 0 63 | if not valid.any(): 64 | return 0 65 | return float(self.dice[valid].mean()) 66 | 67 | @property 68 | def precision(self): 69 | return self.tp.float() / (self.tp + self.fp).float() 70 | 71 | @property 72 | def recall(self): 73 | return self.tp.float() / (self.tp + self.fn).float() -------------------------------------------------------------------------------- /src/utils/geometry.py: -------------------------------------------------------------------------------- 1 | from collections import Iterable 2 | import torch 3 | 4 | def make_grid(grid_size, cell_size=None, grid_offset=None): 5 | """Construct an N-dimensional grid""" 6 | 7 | # Handle default or non-tuple cell_sizes 8 | if cell_size is None: 9 | cell_size = [1.] * len(grid_size) 10 | elif not isinstance(cell_size, Iterable): 11 | cell_size = [cell_size] * len(grid_size) 12 | 13 | # By default the grid offset is set to zero 14 | if grid_offset is None: 15 | grid_offset = [0.] 
* len(grid_size) 16 | 17 | coords = [torch.arange(0, gs, cs) + off for gs, cs, off 18 | in zip(grid_size, cell_size, grid_offset)] 19 | grid = torch.meshgrid(*coords[::-1])[::-1] 20 | return torch.stack(grid, -1) -------------------------------------------------------------------------------- /src/utils/visualise.py: -------------------------------------------------------------------------------- 1 | from matplotlib.cm import get_cmap 2 | 3 | def colorise(tensor, cmap, vmin=None, vmax=None): 4 | 5 | if isinstance(cmap, str): 6 | cmap = get_cmap(cmap) 7 | 8 | tensor = tensor.detach().cpu().float() 9 | 10 | vmin = float(tensor.min()) if vmin is None else vmin 11 | vmax = float(tensor.max()) if vmax is None else vmax 12 | 13 | tensor = (tensor - vmin) / (vmax - vmin) 14 | return cmap(tensor.numpy())[..., :3] -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from datetime import datetime 4 | from argparse import ArgumentParser 5 | from tqdm import tqdm 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.optim import SGD 10 | from torch.optim.lr_scheduler import MultiStepLR 11 | from torch.utils.tensorboard import SummaryWriter 12 | 13 | from src.models.model_factory import build_model, build_criterion 14 | from src.data.data_factory import build_dataloaders 15 | from src.utils.configs import get_default_configuration, load_config 16 | from src.utils.confusion import BinaryConfusionMatrix 17 | from src.data.nuscenes.utils import NUSCENES_CLASS_NAMES 18 | from src.data.argoverse.utils import ARGOVERSE_CLASS_NAMES 19 | from src.utils.visualise import colorise 20 | 21 | def train(dataloader, model, criterion, optimiser, summary, config, epoch): 22 | 23 | model.train() 24 | 25 | # Compute prior probability of occupancy 26 | prior = torch.tensor(config.prior) 27 | prior_log_odds = torch.log(prior / (1 - prior)) 28 | 29 | # Initialise confusion matrix 30 | confusion = BinaryConfusionMatrix(config.num_class) 31 | 32 | # Iterate over dataloader 33 | iteration = (epoch - 1) * len(dataloader) 34 | for i, batch in enumerate(tqdm(dataloader)): 35 | 36 | # Move tensors to GPU 37 | if len(config.gpus) > 0: 38 | batch = [t.cuda() for t in batch] 39 | 40 | # Predict class occupancy scores and compute loss 41 | image, calib, labels, mask = batch 42 | if config.model == 'ved': 43 | logits, mu, logvar = model(image) 44 | loss = criterion(logits, labels, mask, mu, logvar) 45 | else: 46 | logits = model(image, calib) 47 | loss = criterion(logits, labels, mask) 48 | 49 | 50 | # Compute gradients and update parameters 51 | optimiser.zero_grad() 52 | loss.backward() 53 | optimiser.step() 54 | 55 | # Update confusion matrix 56 | scores = logits.cpu().sigmoid() 57 | confusion.update(scores > config.score_thresh, labels, mask) 58 | 59 | # Update tensorboard 60 | if i % config.log_interval == 0: 61 | summary.add_scalar('train/loss', float(loss), iteration) 62 | 63 | # Visualise 64 | if i % config.vis_interval == 0: 65 | visualise(summary, image, scores, labels, mask, iteration, 66 | config.train_dataset, split='train') 67 | 68 | iteration += 1 69 | 70 | # Print and record results 71 | display_results(confusion, config.train_dataset) 72 | log_results(confusion, config.train_dataset, summary, 'train', epoch) 73 | 74 | 75 | 76 | def evaluate(dataloader, model, criterion, summary, config, epoch): 77 | 78 | model.eval() 79 | 80 | # Compute prior probability of occupancy 81 | 
82 |     prior_log_odds = torch.log(prior / (1 - prior))  # note: not used below
83 | 
84 |     # Initialise confusion matrix
85 |     confusion = BinaryConfusionMatrix(config.num_class)
86 | 
87 |     # Iterate over dataset
88 |     for i, batch in enumerate(tqdm(dataloader)):
89 | 
90 |         # Move tensors to GPU
91 |         if len(config.gpus) > 0:
92 |             batch = [t.cuda() for t in batch]
93 | 
94 |         # Predict class occupancy scores and compute loss
95 |         image, calib, labels, mask = batch
96 |         with torch.no_grad():
97 |             if config.model == 'ved':
98 |                 logits, mu, logvar = model(image)
99 |                 loss = criterion(logits, labels, mask, mu, logvar)
100 |             else:
101 |                 logits = model(image, calib)
102 |                 loss = criterion(logits, labels, mask)
103 | 
104 |         # Update confusion matrix
105 |         scores = logits.cpu().sigmoid()
106 |         confusion.update(scores > config.score_thresh, labels, mask)
107 | 
108 |         # Update tensorboard
109 |         if i % config.log_interval == 0:
110 |             summary.add_scalar('val/loss', float(loss), epoch)
111 | 
112 |         # Visualise
113 |         if i % config.vis_interval == 0:
114 |             visualise(summary, image, scores, labels, mask, epoch,
115 |                       config.train_dataset, split='val')
116 | 
117 |     # Print and record results
118 |     display_results(confusion, config.train_dataset)
119 |     log_results(confusion, config.train_dataset, summary, 'val', epoch)
120 | 
121 |     return confusion.mean_iou
122 | 
123 | 
124 | def visualise(summary, image, scores, labels, mask, step, dataset, split):
125 | 
126 |     class_names = NUSCENES_CLASS_NAMES if dataset == 'nuscenes' \
127 |         else ARGOVERSE_CLASS_NAMES
128 | 
129 |     summary.add_image(split + '/image', image[0], step, dataformats='CHW')
130 |     summary.add_image(split + '/pred', colorise(scores[0], 'coolwarm', 0, 1),
131 |                       step, dataformats='NHWC')
132 |     summary.add_image(split + '/gt', colorise(labels[0], 'coolwarm', 0, 1),
133 |                       step, dataformats='NHWC')
134 | 
135 | 
136 |     # for i, name in enumerate(class_names):
137 |     #     summary.add_image(split + '/pred/' + name, scores[0, i], step,
138 |     #                       dataformats='HW')
139 |     #     summary.add_image(split + '/gt/' + name, labels[0, i], step,
140 |     #                       dataformats='HW')
141 | 
142 |     # summary.add_image(split + '/mask', mask[0], step, dataformats='HW')
143 | 
144 | 
145 | def display_results(confusion, dataset):
146 | 
147 |     # Display confusion matrix summary
148 |     class_names = NUSCENES_CLASS_NAMES if dataset == 'nuscenes' \
149 |         else ARGOVERSE_CLASS_NAMES
150 | 
151 |     print('\nResults:')
152 |     for name, iou_score in zip(class_names, confusion.iou):
153 |         print('{:20s} {:.3f}'.format(name, iou_score))
154 |     print('{:20s} {:.3f}'.format('MEAN', confusion.mean_iou))
155 | 
156 | 
157 | 
158 | def log_results(confusion, dataset, summary, split, epoch):
159 | 
160 |     # Display and record epoch IoU scores
161 |     class_names = NUSCENES_CLASS_NAMES if dataset == 'nuscenes' \
162 |         else ARGOVERSE_CLASS_NAMES
163 | 
164 |     for name, iou_score in zip(class_names, confusion.iou):
165 |         summary.add_scalar(f'{split}/iou/{name}', iou_score, epoch)
166 |     summary.add_scalar(f'{split}/iou/MEAN', confusion.mean_iou, epoch)
167 | 
168 | 
169 | 
170 | def save_checkpoint(path, model, optimizer, scheduler, epoch, best_iou):
171 | 
172 |     if isinstance(model, nn.DataParallel):
173 |         model = model.module
174 | 
175 |     ckpt = {
176 |         'model' : model.state_dict(),
177 |         'optimizer' : optimizer.state_dict(),
178 |         'scheduler' : scheduler.state_dict(),
179 |         'epoch' : epoch,
180 |         'best_iou' : best_iou
181 |     }
182 | 
183 |     torch.save(ckpt, path)
184 | 
185 | 
186 | def load_checkpoint(path, model, optimizer, scheduler):
187 | 
188 |     ckpt = torch.load(path)
189 | 
190 |     # Load model weights
191 |     if isinstance(model, nn.DataParallel):
192 |         model = model.module
193 |     model.load_state_dict(ckpt['model'])
194 | 
195 |     # Load optimiser state
196 |     optimizer.load_state_dict(ckpt['optimizer'])
197 | 
198 |     # Load scheduler state
199 |     scheduler.load_state_dict(ckpt['scheduler'])
200 | 
201 |     return ckpt['epoch'], ckpt['best_iou']
202 | 
203 | 
204 | 
205 | # Load the configuration for this experiment
206 | def get_configuration(args):
207 | 
208 |     # Load config defaults
209 |     config = get_default_configuration()
210 | 
211 |     # Load dataset options
212 |     config.merge_from_file(f'configs/datasets/{args.dataset}.yml')
213 | 
214 |     # Load model options
215 |     config.merge_from_file(f'configs/models/{args.model}.yml')
216 | 
217 |     # Load experiment options
218 |     config.merge_from_file(f'configs/experiments/{args.experiment}.yml')
219 | 
220 |     # Restore config from an existing experiment
221 |     if args.resume is not None:
222 |         config.merge_from_file(os.path.join(args.resume, 'config.yml'))
223 | 
224 |     # Override with command line options
225 |     config.merge_from_list(args.options)
226 | 
227 |     # Finalise config
228 |     config.freeze()
229 | 
230 |     return config
231 | 
232 | 
233 | def create_experiment(config, tag, resume=None):
234 | 
235 |     # Restore an existing experiment if a directory is specified
236 |     if resume is not None:
237 |         print("\n==> Restoring experiment from directory:\n" + resume)
238 |         logdir = resume
239 |     else:
240 |         # Otherwise, generate a run directory based on the current time
241 |         name = datetime.now().strftime('{}_%y-%m-%d--%H-%M-%S').format(tag)
242 |         logdir = os.path.join(os.path.expandvars(config.logdir), name)
243 |         print("\n==> Creating new experiment in directory:\n" + logdir)
244 |         os.makedirs(logdir)
245 | 
246 |     # Display the config options on-screen
247 |     print(config.dump())
248 | 
249 |     # Save the current config
250 |     with open(os.path.join(logdir, 'config.yml'), 'w') as f:
251 |         f.write(config.dump())
252 | 
253 |     return logdir
254 | 
255 | 
256 | 
257 | 
258 | 
259 | 
260 | 
261 | 
262 | def main():
263 | 
264 |     parser = ArgumentParser()
265 |     parser.add_argument('--tag', type=str, default='run',
266 |                         help='optional tag to identify the run')
267 |     parser.add_argument('--dataset', choices=['nuscenes', 'argoverse'],
268 |                         default='nuscenes', help='dataset to train on')
269 |     parser.add_argument('--model', choices=['pyramid', 'vpn', 'ved'],
270 |                         default='pyramid', help='model to train')
271 |     parser.add_argument('--experiment', default='test',
272 |                         help='name of experiment config to load')
273 |     parser.add_argument('--resume', default=None,
274 |                         help='path to an experiment to resume')
275 |     parser.add_argument('--options', nargs='*', default=[],
276 |                         help='list of additional config options as key-val pairs')
277 |     args = parser.parse_args()
278 | 
279 |     # Load configuration
280 |     config = get_configuration(args)
281 | 
282 |     # Create a directory for the experiment
283 |     logdir = create_experiment(config, args.tag, args.resume)
284 | 
285 |     # Create tensorboard summary
286 |     summary = SummaryWriter(logdir)
287 | 
288 |     # Set default device
289 |     if len(config.gpus) > 0:
290 |         torch.cuda.set_device(config.gpus[0])
291 | 
292 |     # Setup experiment
293 |     model = build_model(config.model, config)
294 |     criterion = build_criterion(config.model, config)
295 |     train_loader, val_loader = build_dataloaders(config.train_dataset, config)
296 | 
297 |     # Build optimiser and learning rate scheduler
298 |     optimiser = SGD(model.parameters(), config.learning_rate,
299 |                     weight_decay=config.weight_decay)
300 |     lr_scheduler = MultiStepLR(optimiser, config.lr_milestones, 0.1)
301 | 
302 |     # Load checkpoint
303 |     if args.resume:
304 |         epoch, best_iou = load_checkpoint(os.path.join(logdir, 'latest.pth'),
305 |                                           model, optimiser, lr_scheduler)
306 |     else:
307 |         epoch, best_iou = 1, 0
308 | 
309 |     # Main training loop
310 |     while epoch <= config.num_epochs:
311 | 
312 |         print('\n\n=== Beginning epoch {} of {} ==='.format(epoch,
313 |                                                             config.num_epochs))
314 | 
315 |         # Train model for one epoch
316 |         train(train_loader, model, criterion, optimiser, summary, config, epoch)
317 | 
318 |         # Evaluate on the validation set
319 |         val_iou = evaluate(val_loader, model, criterion, summary, config, epoch)
320 | 
321 |         # Update learning rate
322 |         lr_scheduler.step()
323 | 
324 |         # Save checkpoints
325 |         if val_iou > best_iou:
326 |             best_iou = val_iou
327 |             save_checkpoint(os.path.join(logdir, 'best.pth'), model,
328 |                             optimiser, lr_scheduler, epoch, best_iou)
329 | 
330 |         save_checkpoint(os.path.join(logdir, 'latest.pth'), model, optimiser,
331 |                         lr_scheduler, epoch, best_iou)
332 | 
333 |         epoch += 1
334 | 
335 |     print("\nTraining complete!")
336 | 
337 | 
338 | 
339 | if __name__ == '__main__':
340 |     main()
341 | 
342 | 
343 | 
344 | 
345 | 
346 | 
347 | 
348 | 
349 | 
350 | 
351 | 
352 | 
353 | 
354 | 
355 | 
356 | 
357 | 
358 | 
--------------------------------------------------------------------------------
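
Usage note (editor's addition, not part of the repository): the save_checkpoint and load_checkpoint helpers in train.py persist the model, optimiser and scheduler state together with the epoch counter and the best validation IoU. The sketch below shows that round-trip in isolation, assuming the project dependencies are installed and the interpreter is started from the repository root; the throwaway nn.Linear model and the path /tmp/example_ckpt.pth are illustrative stand-ins, not names used by the codebase.

    import torch.nn as nn
    from torch.optim import SGD
    from torch.optim.lr_scheduler import MultiStepLR

    from train import save_checkpoint, load_checkpoint

    # Stand-in objects; a real run would obtain these from build_model and the config
    model = nn.Linear(4, 2)
    optimiser = SGD(model.parameters(), lr=0.1)
    scheduler = MultiStepLR(optimiser, milestones=[10, 20], gamma=0.1)

    # Save epoch 3 with a best IoU of 0.42, then restore both values into the same objects
    save_checkpoint('/tmp/example_ckpt.pth', model, optimiser, scheduler, 3, 0.42)
    epoch, best_iou = load_checkpoint('/tmp/example_ckpt.pth', model, optimiser, scheduler)
    assert epoch == 3 and abs(best_iou - 0.42) < 1e-9

Note that load_checkpoint returns the epoch stored at save time, so a resumed run in main() re-enters the training loop at that same epoch value.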