├── .gitignore
├── LICENSE
├── architecture.png
├── configs
│   ├── datasets
│   │   ├── argoverse.yml
│   │   └── nuscenes.yml
│   ├── defaults.yml
│   ├── experiments
│   │   └── test.yml
│   └── models
│       ├── pyramid.yml
│       ├── ved.yml
│       └── vpn.yml
├── readme.md
├── scripts
│   ├── make_argoverse_labels.py
│   └── make_nuscenes_labels.py
├── src
│   ├── __init__.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── argoverse
│   │   │   ├── __init__.py
│   │   │   ├── dataset.py
│   │   │   ├── splits.py
│   │   │   └── utils.py
│   │   ├── augmentation.py
│   │   ├── data_factory.py
│   │   ├── nuscenes
│   │   │   ├── __init__.py
│   │   │   ├── dataset.py
│   │   │   ├── splits.py
│   │   │   └── utils.py
│   │   └── utils.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── criterion.py
│   │   ├── model_factory.py
│   │   ├── pyramid.py
│   │   ├── ved.py
│   │   └── vpn.py
│   ├── nn
│   │   ├── __init__.py
│   │   ├── classifier.py
│   │   ├── fpn.py
│   │   ├── losses.py
│   │   ├── pyramid.py
│   │   ├── resampler.py
│   │   ├── resnet.py
│   │   ├── topdown.py
│   │   └── transformer.py
│   └── utils
│       ├── __init__.py
│       ├── configs.py
│       ├── confusion.py
│       ├── geometry.py
│       └── visualise.py
└── train.py
/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | notebooks 3 | logs 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies.
96 | #Pipfile.lock 97 | 98 | # celery beat schedule file 99 | celerybeat-schedule 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This work is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc/4.0/. 2 | 3 | -------------------------------------------------------------------------------- /architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tom-roddick/mono-semantic-maps/f6b0f52857a4e622ae42bf24faaf9681cc089f76/architecture.png -------------------------------------------------------------------------------- /configs/datasets/argoverse.yml: -------------------------------------------------------------------------------- 1 | train_dataset: argoverse 2 | dataroot: ${DATA_ROOT}/argoverse/argoverse-tracking 3 | label_root: ${PROCESSED_ROOT}/argoverse/test 4 | img_size: [960, 600] 5 | class_weights: 6 | - 1.7 # drivable_area 7 | - 5.2 # vehicle 8 | - 22.0 # pedestrian 9 | - 9.6 # large_vehicle 10 | - 20.3 # bicycle 11 | - 9.6 # bus 12 | - 7.0 # trailer 13 | - 27.5 # motorcycle 14 | 15 | vpn: 16 | output_size : [38, 60] 17 | 18 | ved: 19 | bottleneck_dim: 28 -------------------------------------------------------------------------------- /configs/datasets/nuscenes.yml: -------------------------------------------------------------------------------- 1 | train_dataset: nuscenes 2 | dataroot: ${DATA_ROOT}/nuscenes 3 | nuscenes_version: v1.0-trainval 4 | label_root: ${PROCESSED_ROOT}/nuscenes/map-labels-v1.2 5 | img_size: [800, 600] 6 | num_class: 14 7 | class_weights: 8 | - 1.7 # drivable_area 9 | - 5.9 # ped_crossing 10 | - 3.3 # walkway 11 | - 4.6 # carpark 12 | - 8.0 # car 13 | - 10.3 # truck 14 | - 10.6 # bus 15 | - 6.9 # trailer 16 | - 11.8 # construction_vehicle 17 | - 30.1 # pedestrian 18 | - 33.6 # motorcycle 19 | - 41.2 # bicycle 20 | - 44.3 # traffic_cone 21 | - 15.9 # barrier 22 | 23 | # Prior probability of a positive prediction, used to initialise classifier 24 | prior: 25 | - 0.44679 # drivable_area 26 | - 0.02407 # ped_crossing 27 | - 0.14491 # walkway 28 | - 0.02994 # carpark 29 | - 0.02086 # car 30 | - 0.00477 # truck 31 | - 0.00156 # bus 32 | - 0.00189 # trailer 33 | - 0.00084 # construction_vehicle 34 | - 0.00119 # pedestrian 35 | - 0.00019 # motorcycle 36 | - 0.00012 # bicycle 37 | - 0.00031 # traffic_cone 38 | - 0.00176 # barrier 39 | 40 | ved: 41 | bottleneck_dim: 18 42 | 43 | vpn: 44 | output_size : [29, 50] -------------------------------------------------------------------------------- /configs/defaults.yml: -------------------------------------------------------------------------------- 1 | 2 | 3 | ### Training options ### 4 | 5 | # IDs of GPUs to use during training 6 | gpus: [0, 2, 3, 4] 7 | 8 | # Number of examples per mini-batch 9 | batch_size: 12 10 | 11 | # Number of dataloader threads 12 | 
num_workers: 8 13 | 14 | # Learning rate 15 | learning_rate: 0.1 16 | 17 | # Decay learning rate by a factor 10 after the following number of epochs 18 | lr_milestones: [150, 185] 19 | 20 | # Weight decay 21 | weight_decay: 0.0001 22 | 23 | # Directory to save experiment to 24 | logdir: logs 25 | 26 | # Number of epochs to train for 27 | num_epochs: 200 28 | 29 | # Number of examples per epoch 30 | epoch_size: 50000 31 | 32 | 33 | #### Data options #### 34 | 35 | # Dataset to train on 36 | train_dataset: nuscenes 37 | 38 | # Name of split used for training 39 | train_split: train 40 | 41 | # Name of split used for validation 42 | val_split: val 43 | 44 | # Root data directory 45 | dataroot: ${DATA_ROOT}/nuscenes 46 | 47 | # NuScenes dataset version 48 | nuscenes_version: v1.0-trainval 49 | 50 | # Directory containing pregenerated training labels 51 | label_root: ${PROCESSED_ROOT}/nuscenes/map-labels-v1.2 52 | 53 | # Input image size after downsampling 54 | img_size: [800, 600] 55 | 56 | # Hold out portion of train data to calibrate on 57 | hold_out_calibration: False 58 | 59 | # Class-specific weighting factors used to balance the cross entropy loss 60 | class_weights: 61 | - 1.7 # drivable_area 62 | - 5.9 # ped_crossing 63 | - 3.3 # walkway 64 | - 4.6 # carpark 65 | - 8.0 # car 66 | - 10.3 # truck 67 | - 10.6 # bus 68 | - 6.9 # trailer 69 | - 11.8 # construction_vehicle 70 | - 30.1 # pedestrian 71 | - 33.6 # motorcycle 72 | - 41.2 # bicycle 73 | - 44.3 # traffic_cone 74 | - 15.9 # barrier 75 | 76 | # Prior probability of a positive prediction, used to initialise classifier 77 | prior: 78 | - 0.44679 # drivable_area 79 | - 0.02407 # ped_crossing 80 | - 0.14491 # walkway 81 | - 0.02994 # carpark 82 | - 0.02086 # car 83 | - 0.00477 # truck 84 | - 0.00156 # bus 85 | - 0.00189 # trailer 86 | - 0.00084 # construction_vehicle 87 | - 0.00119 # pedestrian 88 | - 0.00019 # motorcycle 89 | - 0.00012 # bicycle 90 | - 0.00031 # traffic_cone 91 | - 0.00176 # barrier 92 | 93 | # Whether to use horizontal flips for data augmentation 94 | hflip: True 95 | 96 | # Top-left and bottom right coordinates of map region, in meters 97 | map_extents: [-25., 1., 25., 50.] 98 | 99 | # Spacing between adjacent grid cells in the map, in meters 100 | map_resolution: 0.25 101 | 102 | # Log loss to tensorboard every N iterations 103 | log_interval: 10 104 | 105 | # Visualise predictions every N iterations 106 | vis_interval: 200 107 | 108 | 109 | ### Model options ### 110 | 111 | # Architecture to train [pyramid | ved | vpn ] 112 | model: pyramid 113 | 114 | # Number of intermediate channels in the transformer layer 115 | tfm_channels: 64 116 | 117 | # Vertical extents of the region of interest, in meters 118 | ymin: -2 119 | ymax: 4 120 | 121 | # Approximate camera focal length used for constructing transformers 122 | focal_length: 630. 
123 | 124 | # Topdown network options 125 | topdown: 126 | 127 | # Number of feature channels at each layer of the topdown network 128 | channels: 128 129 | 130 | # Number of blocks in each layer 131 | layers: [4, 4] 132 | 133 | # Upsampling factor in each stage of the topdown network 134 | strides: [1, 2] 135 | 136 | # Type of residual block to use [ basic | bottleneck ] 137 | blocktype: bottleneck 138 | 139 | # Number of output classes to predict 140 | num_class: 14 141 | 142 | # Whether to use Bayesian classifier 143 | bayesian: False 144 | 145 | # Number of samples used for Monte-Carlo inference 146 | mc_samples: 40 147 | 148 | # View parsing network options 149 | vpn: 150 | 151 | # Size of output feature maps 152 | output_size: [29, 50] 153 | 154 | # Number of channels in fully connected layer 155 | fc_dim: 256 156 | 157 | # Variational encoder-decoder network options 158 | ved: 159 | 160 | # Dimensions of bottleneck (depends on the size of input images) 161 | bottleneck_dim: 18 162 | 163 | # Loss function 164 | loss_fn: bce 165 | 166 | # Binary cross entropy loss weight 167 | xent_weight: 1.0 168 | 169 | # Max entropy uncertainty loss weight 170 | uncert_weight: 0.001 171 | 172 | # Focal loss parameters 173 | focal: 174 | alpha: 0.25 175 | gamma: 2 176 | 177 | # KL-Divergence loss weight (used by VED network) 178 | kld_weight: 0.0 179 | 180 | # Method of weighting classes in loss function 181 | weight_mode: sqrt_inverse 182 | 183 | # Threshold to treat prediction as positive 184 | score_thresh: 0.5 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | -------------------------------------------------------------------------------- /configs/experiments/test.yml: -------------------------------------------------------------------------------- 1 | logdir: logs 2 | -------------------------------------------------------------------------------- /configs/models/pyramid.yml: -------------------------------------------------------------------------------- 1 | model: pyramid 2 | tfm_channels: 64 3 | ymin: -2 4 | ymax: 4 5 | focal_length: 630. 6 | topdown: 7 | channels: 128 8 | layers: [4, 4] 9 | strides: [1, 2] 10 | blocktype: bottleneck 11 | num_class: 14 12 | -------------------------------------------------------------------------------- /configs/models/ved.yml: -------------------------------------------------------------------------------- 1 | model: ved 2 | xent_weight: 0.9 3 | uncert_weight: 0.001 4 | kld_weight: 0.1 -------------------------------------------------------------------------------- /configs/models/vpn.yml: -------------------------------------------------------------------------------- 1 | model: vpn -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Predicting Semantic Map Representations from Images with Pyramid Occupancy Networks 2 | 3 | This is the code associated with the paper [Predicting Semantic Map Representations from Images with Pyramid Occupancy Networks](https://arxiv.org/pdf/2003.13402.pdf), published at CVPR 2020. 4 | 5 | ![Pyramid Occupancy Network architecture](architecture.png) 6 | 7 | ## Data generation 8 | In our work we report results on two large-scale autonomous driving datasets: NuScenes and Argoverse. 
The birds-eye-view ground truth labels we use to train and evaluate our networks are generated by combining map information provided by the two datasets with 3D bounding box annotations, which we rasterise to produce a set of one-hot binary labels. We also make use of LiDAR point clouds to infer regions of the birds-eye-view which are completely occluded by buildings or other objects. 9 | 10 | ### NuScenes 11 | To train our method on NuScenes you will first need to: 12 | 1. Download the NuScenes dataset, which can be found at https://www.nuscenes.org/download. Only the metadata, keyframe and lidar blobs are necessary. 13 | 2. Download the map expansion pack. Note that to replicate our original results you should use the original version of the expansion (v1.0). The later versions fixed some bugs in the original maps, so we would expect even better performance! 14 | 3. Install the NuScenes devkit from https://github.com/nutonomy/nuscenes-devkit 15 | 4. Cd to `mono-semantic-maps` 16 | 5. Edit the `configs/datasets/nuscenes.yml` file, setting the `dataroot` and `label_root` entries to the location of the NuScenes dataset and the desired ground truth folder respectively. 17 | 6. Run our data generation script: `python scripts/make_nuscenes_labels.py`. Be warned: there's a lot of data, so this will take a few hours to run! 18 | 19 | ### Argoverse 20 | To train on the Argoverse dataset: 21 | 1. Download the Argoverse tracking data from https://www.argoverse.org/data.html#tracking-link. Our models were trained on version 1.1; you will need to download the four training blobs, the validation blob, and the HD map data. 22 | 2. Install the Argoverse devkit from https://github.com/argoai/argoverse-api 23 | 3. Cd to `mono-semantic-maps` 24 | 4. Edit the `configs/datasets/argoverse.yml` file, setting the `dataroot` and `label_root` entries to the location of the installed Argoverse data and the desired ground truth folder respectively. 25 | 5. Run our data generation script: `python scripts/make_argoverse_labels.py`. This script will also take a while to run! 26 | 27 | 28 | ## Training 29 | Once ground truth labels have been generated, you can train our method by running the `train.py` script in the root directory: 30 | ``` 31 | python train.py --dataset nuscenes --model pyramid 32 | ``` 33 | The `--dataset` flag allows you to specify the dataset to train on, either `'argoverse'` or `'nuscenes'`. The `--model` flag allows training of the proposed method `'pyramid'`, or one of the baseline methods (`'vpn'` or `'ved'`). Additional command line options can be specified by passing a list of key-value pairs to the `--options` flag. The full list of configurable options can be found in the `configs/defaults.yml` file.
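The labels produced by the generation scripts are stored as single-channel 32-bit PNG images: each pixel packs the stack of boolean class masks, plus a final mask marking cells that fall outside the camera frustum or are occluded, into an integer bitmask (see `encode_binary_labels` in `src/data/utils.py`). Below is a minimal inspection sketch showing how one of the generated nuScenes label files can be decoded back into per-class masks using the repository's own helpers; the file path is a placeholder for a real token in your `label_root`.

```python
# Inspection sketch (not part of the training pipeline): decode one generated
# nuScenes label image back into per-class binary masks.
from PIL import Image
from torchvision.transforms.functional import to_tensor

from src.data.utils import decode_binary_labels
from src.data.nuscenes.utils import NUSCENES_CLASS_NAMES

# Placeholder path: any <sample_data_token>.png written by make_nuscenes_labels.py
encoded = to_tensor(Image.open('<label_root>/<sample_data_token>.png')).long()
labels = decode_binary_labels(encoded, len(NUSCENES_CLASS_NAMES) + 1)

# The last channel flags pixels outside the image or occluded by obstacles;
# the remaining channels are one mask per class, in NUSCENES_CLASS_NAMES order.
class_masks, valid = labels[:-1], ~labels[-1]
```

This mirrors the decoding performed in `src/data/nuscenes/dataset.py` and `src/data/argoverse/dataset.py` when training batches are loaded.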
34 | 35 | -------------------------------------------------------------------------------- /scripts/make_argoverse_labels.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | from PIL import Image 5 | from progressbar import ProgressBar 6 | 7 | from argoverse.map_representation.map_api import ArgoverseMap 8 | from argoverse.data_loading.argoverse_tracking_loader \ 9 | import ArgoverseTrackingLoader 10 | from argoverse.utils.camera_stats import RING_CAMERA_LIST 11 | 12 | sys.path.append(os.path.abspath(os.path.join(__file__, '../..'))) 13 | 14 | from src.utils.configs import get_default_configuration 15 | from src.data.utils import get_visible_mask, get_occlusion_mask, \ 16 | encode_binary_labels 17 | from src.data.argoverse.utils import get_object_masks, get_map_mask 18 | 19 | 20 | def process_split(split, map_data, config): 21 | 22 | # Create an Argoverse loader instance 23 | path = os.path.join(os.path.expandvars(config.argoverse.root), split) 24 | print("Loading Argoverse tracking data at " + path) 25 | loader = ArgoverseTrackingLoader(path) 26 | 27 | for scene in loader: 28 | process_scene(split, scene, map_data, config) 29 | 30 | 31 | def process_scene(split, scene, map_data, config): 32 | 33 | print("\n\n==> Processing scene: " + scene.current_log) 34 | 35 | i = 0 36 | progress = ProgressBar( 37 | max_value=len(RING_CAMERA_LIST) * scene.num_lidar_frame) 38 | 39 | # Iterate over each camera and each frame in the sequence 40 | for camera in RING_CAMERA_LIST: 41 | for frame in range(scene.num_lidar_frame): 42 | progress.update(i) 43 | process_frame(split, scene, camera, frame, map_data, config) 44 | i += 1 45 | 46 | 47 | def process_frame(split, scene, camera, frame, map_data, config): 48 | 49 | # Compute object masks 50 | masks = get_object_masks(scene, camera, frame, config.map_extents, 51 | config.map_resolution) 52 | 53 | # Compute drivable area mask 54 | masks[0] = get_map_mask(scene, camera, frame, map_data, config.map_extents, 55 | config.map_resolution) 56 | 57 | # Ignore regions of the BEV which are outside the image 58 | calib = scene.get_calibration(camera) 59 | masks[-1] |= ~get_visible_mask(calib.K, calib.camera_config.img_width, 60 | config.map_extents, config.map_resolution) 61 | 62 | # Ignore regions of the BEV which are occluded (based on LiDAR data) 63 | lidar = scene.get_lidar(frame) 64 | cam_lidar = calib.project_ego_to_cam(lidar) 65 | masks[-1] |= get_occlusion_mask(cam_lidar, config.map_extents, 66 | config.map_resolution) 67 | 68 | # Encode masks as an integer bitmask 69 | labels = encode_binary_labels(masks) 70 | 71 | # Create a filename and directory 72 | timestamp = str(scene.image_timestamp_list_sync[camera][frame]) 73 | output_path = os.path.join(config.argoverse.label_root, split, 74 | scene.current_log, camera, 75 | f'{camera}_{timestamp}.png') 76 | os.makedirs(os.path.dirname(output_path), exist_ok=True) 77 | 78 | # Save encoded label file to disk 79 | Image.fromarray(labels.astype(np.int32), mode='I').save(output_path) 80 | 81 | 82 | if __name__ == '__main__': 83 | 84 | config = get_default_configuration() 85 | config.merge_from_file('configs/datasets/argoverse.yml') 86 | 87 | # Create an Argoverse map instance 88 | map_data = ArgoverseMap() 89 | 90 | for split in ['train', 'val']: 91 | process_split(split, map_data, config) 92 | 93 | 94 | -------------------------------------------------------------------------------- /scripts/make_nuscenes_labels.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | from PIL import Image 5 | from tqdm import tqdm 6 | from collections import OrderedDict 7 | 8 | from shapely.strtree import STRtree 9 | from nuscenes import NuScenes 10 | from nuscenes.map_expansion.map_api import NuScenesMap 11 | 12 | sys.path.append(os.path.abspath(os.path.join(__file__, '../..'))) 13 | 14 | from src.utils.configs import get_default_configuration 15 | from src.data.utils import get_visible_mask, get_occlusion_mask, transform, \ 16 | encode_binary_labels 17 | import src.data.nuscenes.utils as nusc_utils 18 | 19 | 20 | def process_scene(nuscenes, map_data, scene, config): 21 | 22 | # Get the map corresponding to the current sample data 23 | log = nuscenes.get('log', scene['log_token']) 24 | scene_map_data = map_data[log['location']] 25 | 26 | # Iterate over samples 27 | first_sample_token = scene['first_sample_token'] 28 | for sample in nusc_utils.iterate_samples(nuscenes, first_sample_token): 29 | process_sample(nuscenes, scene_map_data, sample, config) 30 | 31 | 32 | def process_sample(nuscenes, map_data, sample, config): 33 | 34 | # Load the lidar point cloud associated with this sample 35 | lidar_data = nuscenes.get('sample_data', sample['data']['LIDAR_TOP']) 36 | lidar_pcl = nusc_utils.load_point_cloud(nuscenes, lidar_data) 37 | 38 | # Transform points into world coordinate system 39 | lidar_transform = nusc_utils.get_sensor_transform(nuscenes, lidar_data) 40 | lidar_pcl = transform(lidar_transform, lidar_pcl) 41 | 42 | # Iterate over sample data 43 | for camera in nusc_utils.CAMERA_NAMES: 44 | sample_data = nuscenes.get('sample_data', sample['data'][camera]) 45 | process_sample_data(nuscenes, map_data, sample_data, lidar_pcl, config) 46 | 47 | 48 | def process_sample_data(nuscenes, map_data, sample_data, lidar, config): 49 | 50 | # Render static road geometry masks 51 | map_masks = nusc_utils.get_map_masks(nuscenes, 52 | map_data, 53 | sample_data, 54 | config.map_extents, 55 | config.map_resolution) 56 | 57 | # Render dynamic object masks 58 | obj_masks = nusc_utils.get_object_masks(nuscenes, 59 | sample_data, 60 | config.map_extents, 61 | config.map_resolution) 62 | masks = np.concatenate([map_masks, obj_masks], axis=0) 63 | 64 | # Ignore regions of the BEV which are outside the image 65 | sensor = nuscenes.get('calibrated_sensor', 66 | sample_data['calibrated_sensor_token']) 67 | intrinsics = np.array(sensor['camera_intrinsic']) 68 | masks[-1] |= ~get_visible_mask(intrinsics, sample_data['width'], 69 | config.map_extents, config.map_resolution) 70 | 71 | # Transform lidar points into camera coordinates 72 | cam_transform = nusc_utils.get_sensor_transform(nuscenes, sample_data) 73 | cam_points = transform(np.linalg.inv(cam_transform), lidar) 74 | masks[-1] |= get_occlusion_mask(cam_points, config.map_extents, 75 | config.map_resolution) 76 | 77 | # Encode masks as integer bitmask 78 | labels = encode_binary_labels(masks) 79 | 80 | # Save outputs to disk 81 | output_path = os.path.join(os.path.expandvars(config.label_root), 82 | sample_data['token'] + '.png') 83 | Image.fromarray(labels.astype(np.int32), mode='I').save(output_path) 84 | 85 | 86 | def load_map_data(dataroot, location): 87 | 88 | # Load the NuScenes map object 89 | nusc_map = NuScenesMap(dataroot, location) 90 | 91 | map_data = OrderedDict() 92 | for layer in nusc_utils.STATIC_CLASSES: 93 | 94 | # Retrieve all data associated with the current layer 95 | records = 
getattr(nusc_map, layer) 96 | polygons = list() 97 | 98 | # Drivable area records can contain multiple polygons 99 | if layer == 'drivable_area': 100 | for record in records: 101 | 102 | # Convert each entry in the record into a shapely object 103 | for token in record['polygon_tokens']: 104 | poly = nusc_map.extract_polygon(token) 105 | if poly.is_valid: 106 | polygons.append(poly) 107 | else: 108 | for record in records: 109 | 110 | # Convert each entry in the record into a shapely object 111 | poly = nusc_map.extract_polygon(record['polygon_token']) 112 | if poly.is_valid: 113 | polygons.append(poly) 114 | 115 | 116 | # Store as an R-Tree for fast intersection queries 117 | map_data[layer] = STRtree(polygons) 118 | 119 | return map_data 120 | 121 | 122 | 123 | 124 | 125 | if __name__ == '__main__': 126 | 127 | # Load the default configuration 128 | config = get_default_configuration() 129 | config.merge_from_file('configs/datasets/nuscenes.yml') 130 | 131 | # Load NuScenes dataset 132 | dataroot = os.path.expandvars(config.dataroot) 133 | nuscenes = NuScenes(config.nuscenes_version, dataroot) 134 | 135 | # Preload NuScenes map data 136 | map_data = { location : load_map_data(dataroot, location) 137 | for location in nusc_utils.LOCATIONS } 138 | 139 | # Create a directory for the generated labels 140 | output_root = os.path.expandvars(config.label_root) 141 | os.makedirs(output_root, exist_ok=True) 142 | 143 | # print(nuscenes.scene) 144 | # Iterate over NuScene scenes 145 | print("\nGenerating labels...") 146 | for scene in tqdm(nuscenes.scene): 147 | process_scene(nuscenes, map_data, scene, config) 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tom-roddick/mono-semantic-maps/f6b0f52857a4e622ae42bf24faaf9681cc089f76/src/__init__.py -------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tom-roddick/mono-semantic-maps/f6b0f52857a4e622ae42bf24faaf9681cc089f76/src/data/__init__.py -------------------------------------------------------------------------------- /src/data/argoverse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tom-roddick/mono-semantic-maps/f6b0f52857a4e622ae42bf24faaf9681cc089f76/src/data/argoverse/__init__.py -------------------------------------------------------------------------------- /src/data/argoverse/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | import torch 4 | from torch.utils.data import Dataset 5 | from torchvision.transforms.functional import to_tensor 6 | from argoverse.data_loading.argoverse_tracking_loader \ 7 | import ArgoverseTrackingLoader 8 | from argoverse.utils.camera_stats import RING_CAMERA_LIST 9 | 10 | from .utils import IMAGE_WIDTH, IMAGE_HEIGHT, ARGOVERSE_CLASS_NAMES 11 | from ..utils import decode_binary_labels 12 | 13 | 14 | class ArgoverseMapDataset(Dataset): 15 | 16 | def __init__(self, argo_loaders, label_root, image_size=[960, 600], 17 | log_names=None): 18 | 19 | self.label_root = label_root 20 | self.image_size = image_size 21 | 22 | self.examples = 
dict() 23 | self.calibs = dict() 24 | 25 | # Preload training examples from Argoverse train and test sets 26 | self.loaders = argo_loaders 27 | for split, loader in self.loaders.items(): 28 | self.preload(split, loader, log_names) 29 | 30 | 31 | def preload(self, split, loader, log_names=None): 32 | 33 | # Iterate over sequences 34 | for log in loader: 35 | 36 | # Check if the log is within the current dataset split 37 | logid = log.current_log 38 | if log_names is not None and logid not in log_names: 39 | continue 40 | 41 | self.calibs[logid] = dict() 42 | for camera, timestamps in log.image_timestamp_list_sync.items(): 43 | 44 | if camera not in RING_CAMERA_LIST: 45 | continue 46 | 47 | # Load image paths 48 | for timestamp in timestamps: 49 | self.examples[timestamp] = (split, logid, camera) 50 | 51 | 52 | def __len__(self): 53 | return len(self.examples) 54 | 55 | 56 | def __getitem__(self, timestamp): 57 | 58 | # Get the split, log and camera ids corresponding to the given timestamp 59 | split, log, camera = self.examples[timestamp] 60 | 61 | image = self.load_image(split, log, camera, timestamp) 62 | calib = self.load_calib(split, log, camera) 63 | labels, mask = self.load_labels(split, log, camera, timestamp) 64 | 65 | return image, calib, labels, mask 66 | 67 | 68 | def load_image(self, split, log, camera, timestamp): 69 | 70 | # Load image 71 | loader = self.loaders[split] 72 | image = loader.get_image_at_timestamp(timestamp, camera, log) 73 | 74 | # Resize to the desired dimensions 75 | image = image.resize(self.image_size) 76 | 77 | return to_tensor(image) 78 | 79 | 80 | def load_calib(self, split, log, camera): 81 | 82 | # Get the loader for the current split 83 | loader = self.loaders[split] 84 | 85 | # Get intrinsics matrix and rescale to account for downsampling 86 | calib = loader.get_calibration(camera, log).K[:,:3] 87 | calib[0] *= self.image_size[0] / IMAGE_WIDTH 88 | calib[1] *= self.image_size[1] / IMAGE_HEIGHT 89 | 90 | # Convert to a torch tensor 91 | return torch.from_numpy(calib) 92 | 93 | 94 | def load_labels(self, split, log, camera, timestamp): 95 | 96 | # Construct label path from example data (matches the layout written by 97 | # scripts/make_argoverse_labels.py) 98 | label_path = os.path.join(self.label_root, split, log, camera, f'{camera}_{timestamp}.png') 99 | 100 | # Load encoded label image as a torch tensor 101 | encoded_labels = to_tensor(Image.open(label_path)).long() 102 | 103 | # Decode to binary labels 104 | num_class = len(ARGOVERSE_CLASS_NAMES) 105 | labels = decode_binary_labels(encoded_labels, num_class + 1) 106 | labels, mask = labels[:-1], ~labels[-1] 107 | 108 | return labels, mask 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /src/data/argoverse/splits.py: -------------------------------------------------------------------------------- 1 | 2 | TRAIN_LOGS = [ 3 | "26d141ec-f952-3908-b4cc-ae359377424e", 4 | "10b8dee6-778f-33e4-a946-d842d2d9c3d7", 5 | "273c1883-673a-36bf-b124-88311b1a80be", 6 | "f3fb839e-0aa2-342b-81c3-312b80be44f9", 7 | "a073e840-6319-3f0b-843e-f6dccdcc7b77", 8 | "c6911883-1843-3727-8eaa-41dc8cda8993", 9 | "dcdcd8b3-0ba1-3218-b2ea-7bb965aad3f0", 10 | "043aeba7-14e5-3cde-8a5c-639389b6d3a6", 11 | "230970eb-dc2e-3133-b252-ff3c6f5d4284", 12 | "b3def699-884b-3c9e-87e1-1ab76c618e0b", 13 | "e17eed4f-3ffd-3532-ab89-41a3f24cf226", 14 | "8a15674a-ae5c-38e2-bc4b-f4156d384072", 15 | "11953248-1195-1195-1195-511954366464", 16 | "3d20ae25-5b29-320d-8bae-f03e9dc177b9", 17 | "10b3a1d8-e56c-38be-aaf7-ef2f862a5c4e", 18 |
"02cf0ce1-699a-373b-86c0-eb6fd5f4697a", 19 | "08a8b7f0-c317-3bdb-b3dc-b7c9b6d033e2", 20 | "15c802a9-0f0e-3c87-b516-a3fa02f1ecb0", 21 | "0ef28d5c-ae34-370b-99e7-6709e1c4b929", 22 | "22160544-2216-2216-2216-722161741824", 23 | "38b2c7ef-069b-3d9d-bbeb-8847b8c89fb6", 24 | "45753856-4575-4575-4575-345754906624", 25 | "53037376-5303-5303-5303-553038557184", 26 | "5c251c22-11b2-3278-835c-0cf3cdee3f44", 27 | "74750688-7475-7475-7475-474752397312", 28 | "75756160-7575-7575-7575-675757273088", 29 | "bae67a44-0f30-30c1-8999-06fc1c7ab80a", 30 | "e8ce69b2-36ab-38e8-87a4-b9e20fee7fd2", 31 | "ebe7a98b-d383-343b-96d6-9e681e2c6a36", 32 | "f0826a9f-f46e-3c27-97af-87a77f7899cd", 33 | "fa0b626f-03df-35a0-8447-021088814b8b", 34 | "10f92308-e06e-3725-a302-4b09e6e790ad", 35 | "29789600-2979-2979-2979-429790834688", 36 | "6162d72f-2990-3a30-9bba-19bbd882985c", 37 | "649750f3-0163-34eb-a102-7aaf5384eaec", 38 | "6c739f57-96d0-33e6-972d-af29cc527e1f", 39 | "95731808-9573-9573-9573-295732883456", 40 | "a6cab660-f086-3e2a-8ad9-7144f93f5b68", 41 | "aebe6aaa-6a95-39e6-9a8d-06103141fcde", 42 | "af706af1-a226-3f6f-8d65-b1f4b9457c48", 43 | "e9bb51af-1112-34c2-be3e-7ebe826649b4", 44 | "ff78e1a3-6deb-34a4-9a1f-b85e34980f06", 45 | "2bc6a872-9979-3493-82eb-fb55407473c9", 46 | "2c07fcda-6671-3ac0-ac23-4a232e0e031e", 47 | "49d66e75-3ce6-316b-b589-f659c7ef5e6d", 48 | "91326240-9132-9132-9132-591327440896", 49 | "5ab2697b-6e3e-3454-a36a-aba2c6f27818", 50 | "e9a96218-365b-3ecd-a800-ed2c4c306c78", 51 | "cb0cba51-dfaf-34e9-a0c2-d931404c3dd8", 52 | "b1ca08f1-24b0-3c39-ba4e-d5a92868462c", 53 | "1d676737-4110-3f7e-bec0-0c90f74c248f", 54 | "da734d26-8229-383f-b685-8086e58d1e05", 55 | "cd5bb988-092e-396c-8f33-e30969c98535", 56 | "f9fa3960-537f-3151-a1a3-37a9c0d6d7f7", 57 | "aeb73d7a-8257-3225-972e-99307b3a5cb0", 58 | "39556000-3955-3955-3955-039557148672", 59 | "c9d6ebeb-be15-3df8-b6f1-5575bea8e6b9", 60 | "5f317f5f-3ce9-355b-acf9-386a8c682252", 61 | "64724064-6472-6472-6472-764725145600", 62 | "cd64733a-dd8a-3bdf-b46a-b7144226168a", 63 | "6db21fda-80cd-3f85-b4a7-0aadeb14724d", 64 | "f1008c18-e76e-3c24-adcc-da9858fac145", 65 | "00c561b9-2057-358d-82c6-5b06d76cebcf", 66 | "cb762bb1-7ce1-3ba5-b53d-13c159b532c8", 67 | "2d12da1d-5238-3870-bfbc-b281d5e8c1a1" 68 | ] 69 | 70 | 71 | VAL_LOGS = [ 72 | "6f153f9c-edc5-389f-ac6f-40705c30d97e", 73 | "25952736-2595-2595-2595-225953853440", 74 | "88538208-8853-8853-8853-388539396096", 75 | "84c35ea7-1a99-3a0c-a3ea-c5915d68acbc", 76 | "64c12551-adb9-36e3-a0c1-e43a0e9f3845", 77 | "3138907e-1f8a-362f-8f3d-773f795a0d01", 78 | "4137e94a-c5da-38bd-ad06-6d57b24bccd0", 79 | "53213cf0-540b-3b5a-9900-d24d1d41bda0", 80 | "577ea60d-7cc0-34a4-a8ff-0401e5ab9c62", 81 | "d60558d2-d1aa-34ee-a902-e061e346e02a", 82 | "fb471bd6-7c81-3d93-ad12-ac54a28beb84", 83 | "52af191b-ba56-326c-b569-e37790db40f3", 84 | "919be600-da69-3f09-b0fd-f42f7eb2e097", 85 | "99c45b6e-6fc7-39b8-80d7-727c485fb561", 86 | "9da4ca63-f524-3b38-8c8b-624f17518574", 87 | "ba067318-0d89-34b5-b577-b171b1a4212b", 88 | "cd38ac0b-c5a6-3743-a148-f4f7b804ed17", 89 | "d4d9e91f-0f8e-334d-bd0e-0d062467308a", 90 | "de777454-df62-3d5a-a1ce-2edb5e5d4922", 91 | "70d2aea5-dbeb-333d-b21e-76a7f2f1ba1c", 92 | "033669d3-3d6b-3d3d-bd93-7985d86653ea", 93 | "7d37fc6b-1028-3f6f-b980-adb5fa73021e", 94 | "33737504-3373-3373-3373-633738571776", 95 | "85bc130b-97ae-37fb-a129-4fc07c80cca7" 96 | ] -------------------------------------------------------------------------------- /src/data/argoverse/utils.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.ndimage import affine_transform 3 | from ..utils import render_polygon 4 | 5 | 6 | # Define Argoverse-specific constants 7 | IMAGE_WIDTH = 1920 8 | IMAGE_HEIGHT = 1200 9 | 10 | ARGOVERSE_CLASS_NAMES = [ 11 | 'drivable_area', 'vehicle', 'pedestrian', 'large_vehicle', 'bicycle', 'bus', 12 | 'trailer', 'motorcycle', 13 | ] 14 | 15 | ARGOVERSE_CLASS_MAPPING = { 16 | 'VEHICLE' : 'vehicle', 17 | 'PEDESTRIAN' : 'pedestrian', 18 | # 'ON_ROAD_OBSTACLE' : 'ignore', 19 | 'LARGE_VEHICLE' : 'large_vehicle', 20 | 'BICYCLE' : 'bicycle', 21 | 'BICYCLIST' : 'bicycle', 22 | 'BUS' : 'bus', 23 | # 'OTHER_MOVER' : 'ignore', 24 | 'TRAILER' : 'trailer', 25 | 'MOTORCYCLIST' : 'motorcycle', 26 | 'MOPED' : 'motorcycle', 27 | 'MOTORCYCLE' : 'motorcycle', 28 | # 'STROLLER' : 'ignore', 29 | 'EMERGENCY_VEHICLE' : 'vehicle', 30 | # 'ANIMAL' : 'ignore', 31 | } 32 | 33 | def argoverse_name_to_class_id(name): 34 | if name in ARGOVERSE_CLASS_MAPPING: 35 | return ARGOVERSE_CLASS_NAMES.index(ARGOVERSE_CLASS_MAPPING[name]) 36 | else: 37 | return -1 38 | 39 | 40 | def get_object_masks(scene, camera, frame, extents, resolution): 41 | 42 | # Get the dimensions of the birds-eye-view mask 43 | x1, z1, x2, z2 = extents 44 | mask_width = int((x2 - x1) / resolution) 45 | mask_height = int((z2 - z1) / resolution) 46 | 47 | # Initialise masks 48 | num_class = len(ARGOVERSE_CLASS_NAMES) 49 | masks = np.zeros((num_class + 1, mask_height, mask_width), dtype=np.uint8) 50 | 51 | # Get calibration information 52 | calib = scene.get_calibration(camera) 53 | 54 | # Iterate over objects in the scene 55 | for obj in scene.get_label_object(frame): 56 | 57 | # Get the bounding box and convert into camera coordinates 58 | bbox = obj.as_2d_bbox()[[0, 1, 3, 2]] 59 | cam_bbox = calib.project_ego_to_cam(bbox)[:, [0, 2]] 60 | 61 | # Render the bounding box to the appropriate mask layer 62 | class_id = argoverse_name_to_class_id(obj.label_class) 63 | render_polygon(masks[class_id], cam_bbox, extents, resolution) 64 | 65 | return masks.astype(np.bool) 66 | 67 | 68 | def get_map_mask(scene, camera, frame, map_data, extents, resolution): 69 | 70 | # Get the dimensions of the birds-eye-view mask 71 | x1, z1, x2, z2 = extents 72 | mask_width = int((x2 - x1) / resolution) 73 | mask_height = int((z2 - z1) / resolution) 74 | 75 | # Get rasterised map 76 | city_mask, map_tfm = map_data.get_rasterized_driveable_area(scene.city_name) 77 | 78 | # Get 3D transform from camera to world coordinates 79 | extrinsic = scene.get_calibration(camera).extrinsic 80 | pose = scene.get_pose(frame).transform_matrix 81 | cam_to_world_tfm = np.matmul(pose, np.linalg.inv(extrinsic)) 82 | 83 | # Get 2D affine transform from camera to map coordinates 84 | cam_to_map_tfm = np.matmul(map_tfm, cam_to_world_tfm[[0, 1, 3]]) 85 | 86 | # Get 2D affine transform from BEV coords to map coords 87 | bev_to_cam_tfm = np.array([[resolution, 0, x1], 88 | [0, resolution, z1], 89 | [0, 0, 1]]) 90 | bev_to_map_tfm = np.matmul(cam_to_map_tfm[:, [0, 2, 3]], bev_to_cam_tfm) 91 | 92 | # Warp map image to bev coordinate system 93 | mask = affine_transform(city_mask, bev_to_map_tfm[[1, 0]], 94 | output_shape=(mask_width, mask_height)).T 95 | return mask[None] 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /src/data/augmentation.py:
-------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | class AugmentedMapDataset(Dataset): 5 | 6 | def __init__(self, dataset, hflip=True): 7 | self.dataset = dataset 8 | self.hflip = hflip 9 | 10 | def __len__(self): 11 | return len(self.dataset) 12 | 13 | def __getitem__(self, index): 14 | image, calib, labels, mask = self.dataset[index] 15 | 16 | # Apply data augmentation 17 | if self.hflip: 18 | image, labels, mask = random_hflip(image, labels, mask) 19 | 20 | return image, calib, labels, mask 21 | 22 | 23 | def random_hflip(image, labels, mask): 24 | image = torch.flip(image, (-1,)) 25 | labels = torch.flip(labels.int(), (-1,)).bool() 26 | mask = torch.flip(mask.int(), (-1,)).bool() 27 | return image, labels, mask -------------------------------------------------------------------------------- /src/data/data_factory.py: -------------------------------------------------------------------------------- 1 | import os 2 | from torch.utils.data import DataLoader, RandomSampler 3 | from .augmentation import AugmentedMapDataset 4 | 5 | from nuscenes import NuScenes 6 | from .nuscenes.dataset import NuScenesMapDataset 7 | from .nuscenes.splits import TRAIN_SCENES, VAL_SCENES, CALIBRATION_SCENES 8 | 9 | from argoverse.data_loading.argoverse_tracking_loader import ArgoverseTrackingLoader 10 | from .argoverse.dataset import ArgoverseMapDataset 11 | from .argoverse.splits import TRAIN_LOGS, VAL_LOGS 12 | 13 | 14 | def build_nuscenes_datasets(config): 15 | print('==> Loading NuScenes dataset...') 16 | nuscenes = NuScenes(config.nuscenes_version, 17 | os.path.expandvars(config.dataroot)) 18 | 19 | # Exclude calibration scenes 20 | if config.hold_out_calibration: 21 | train_scenes = list(set(TRAIN_SCENES) - set(CALIBRATION_SCENES)) 22 | else: 23 | train_scenes = TRAIN_SCENES 24 | 25 | train_data = NuScenesMapDataset(nuscenes, config.label_root, 26 | config.img_size, train_scenes) 27 | val_data = NuScenesMapDataset(nuscenes, config.label_root, 28 | config.img_size, VAL_SCENES) 29 | return train_data, val_data 30 | 31 | 32 | def build_argoverse_datasets(config): 33 | print('==> Loading Argoverse dataset...') 34 | dataroot = os.path.expandvars(config.dataroot) 35 | 36 | # Load native argoverse splits 37 | loaders = { 38 | 'train' : ArgoverseTrackingLoader(os.path.join(dataroot, 'train')), 39 | 'val' : ArgoverseTrackingLoader(os.path.join(dataroot, 'val')) 40 | } 41 | 42 | # Create datasets using new argoverse splits 43 | train_data = ArgoverseMapDataset(loaders, config.label_root, 44 | config.img_size, TRAIN_LOGS) 45 | val_data = ArgoverseMapDataset(loaders, config.label_root, 46 | config.img_size, VAL_LOGS) 47 | return train_data, val_data 48 | 49 | 50 | def build_datasets(dataset_name, config): 51 | if dataset_name == 'nuscenes': 52 | return build_nuscenes_datasets(config) 53 | elif dataset_name == 'argoverse': 54 | return build_argoverse_datasets(config) 55 | else: 56 | raise ValueError(f"Unknown dataset option '{dataset_name}'") 57 | 58 | 59 | 60 | def build_trainval_datasets(dataset_name, config): 61 | 62 | # Construct the base dataset 63 | train_data, val_data = build_datasets(dataset_name, config) 64 | 65 | # Add data augmentation to train dataset 66 | train_data = AugmentedMapDataset(train_data, config.hflip) 67 | 68 | return train_data, val_data 69 | 70 | 71 | def build_dataloaders(dataset_name, config): 72 | 73 | # Build training and validation datasets 74 | train_data, val_data = 
build_trainval_datasets(dataset_name, config) 75 | 76 | # Create training set dataloader 77 | sampler = RandomSampler(train_data, True, config.epoch_size) 78 | train_loader = DataLoader(train_data, config.batch_size, sampler=sampler, 79 | num_workers=config.num_workers) 80 | 81 | # Create validation dataloader 82 | val_loader = DataLoader(val_data, config.batch_size, 83 | num_workers=config.num_workers) 84 | 85 | return train_loader, val_loader 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /src/data/nuscenes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tom-roddick/mono-semantic-maps/f6b0f52857a4e622ae42bf24faaf9681cc089f76/src/data/nuscenes/__init__.py -------------------------------------------------------------------------------- /src/data/nuscenes/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.data import Dataset 4 | from PIL import Image, ImageFile 5 | from nuscenes import NuScenes 6 | from torchvision.transforms.functional import to_tensor 7 | 8 | from .utils import CAMERA_NAMES, NUSCENES_CLASS_NAMES, iterate_samples 9 | from ..utils import decode_binary_labels 10 | 11 | class NuScenesMapDataset(Dataset): 12 | 13 | def __init__(self, nuscenes, map_root, image_size=(800, 450), 14 | scene_names=None): 15 | 16 | self.nuscenes = nuscenes 17 | self.map_root = os.path.expandvars(map_root) 18 | self.image_size = image_size 19 | 20 | # Preload the list of tokens in the dataset 21 | self.get_tokens(scene_names) 22 | 23 | # Allow PIL to load partially corrupted images 24 | # (otherwise training crashes at the most inconvenient possible times!) 
25 | ImageFile.LOAD_TRUNCATED_IMAGES = True 26 | 27 | 28 | def get_tokens(self, scene_names=None): 29 | 30 | self.tokens = list() 31 | 32 | # Iterate over scenes 33 | for scene in self.nuscenes.scene: 34 | 35 | # Ignore scenes which don't belong to the current split 36 | if scene_names is not None and scene['name'] not in scene_names: 37 | continue 38 | 39 | # Iterate over samples 40 | for sample in iterate_samples(self.nuscenes, 41 | scene['first_sample_token']): 42 | 43 | # Iterate over cameras 44 | for camera in CAMERA_NAMES: 45 | self.tokens.append(sample['data'][camera]) 46 | 47 | return self.tokens 48 | 49 | 50 | def __len__(self): 51 | return len(self.tokens) 52 | 53 | def __getitem__(self, index): 54 | token = self.tokens[index] 55 | 56 | image = self.load_image(token) 57 | calib = self.load_calib(token) 58 | labels, mask = self.load_labels(token) 59 | 60 | return image, calib, labels, mask 61 | 62 | 63 | def load_image(self, token): 64 | 65 | # Load image as a PIL image 66 | image = Image.open(self.nuscenes.get_sample_data_path(token)) 67 | 68 | # Resize to input resolution 69 | image = image.resize(self.image_size) 70 | 71 | # Convert to a torch tensor 72 | return to_tensor(image) 73 | 74 | 75 | def load_calib(self, token): 76 | 77 | # Load camera intrinsics matrix 78 | sample_data = self.nuscenes.get('sample_data', token) 79 | sensor = self.nuscenes.get( 80 | 'calibrated_sensor', sample_data['calibrated_sensor_token']) 81 | intrinsics = torch.tensor(sensor['camera_intrinsic']) 82 | 83 | # Scale calibration matrix to account for image downsampling 84 | intrinsics[0] *= self.image_size[0] / sample_data['width'] 85 | intrinsics[1] *= self.image_size[1] / sample_data['height'] 86 | return intrinsics 87 | 88 | 89 | def load_labels(self, token): 90 | 91 | # Load label image as a torch tensor 92 | label_path = os.path.join(self.map_root, token + '.png') 93 | encoded_labels = to_tensor(Image.open(label_path)).long() 94 | 95 | # Decode to binary labels 96 | num_class = len(NUSCENES_CLASS_NAMES) 97 | labels = decode_binary_labels(encoded_labels, num_class + 1) 98 | labels, mask = labels[:-1], ~labels[-1] 99 | 100 | return labels, mask 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /src/data/nuscenes/splits.py: -------------------------------------------------------------------------------- 1 | 2 | TRAIN_SCENES = [ 3 | "scene-0002", "scene-0003", "scene-0004", "scene-0005", "scene-0006", 4 | "scene-0007", "scene-0008", "scene-0009", "scene-0012", "scene-0013", 5 | "scene-0014", "scene-0015", "scene-0016", "scene-0017", "scene-0018", 6 | "scene-0019", "scene-0021", "scene-0022", "scene-0023", "scene-0024", 7 | "scene-0025", "scene-0026", "scene-0027", "scene-0028", "scene-0029", 8 | "scene-0030", "scene-0031", "scene-0032", "scene-0033", "scene-0034", 9 | "scene-0035", "scene-0036", "scene-0039", "scene-0042", "scene-0043", 10 | "scene-0044", "scene-0045", "scene-0046", "scene-0047", "scene-0048", 11 | "scene-0049", "scene-0050", "scene-0051", "scene-0052", "scene-0055", 12 | "scene-0056", "scene-0057", "scene-0058", "scene-0059", "scene-0060", 13 | "scene-0061", "scene-0062", "scene-0063", "scene-0064", "scene-0065", 14 | "scene-0066", "scene-0067", "scene-0068", "scene-0069", "scene-0070", 15 | "scene-0071", "scene-0072", "scene-0073", "scene-0074", "scene-0075", 16 | "scene-0076", "scene-0092", "scene-0093", "scene-0094", "scene-0095", 17 | "scene-0096", "scene-0097", "scene-0098", "scene-0099", "scene-0100", 18 | "scene-0101", 
"scene-0102", "scene-0103", "scene-0104", "scene-0105", 19 | "scene-0106", "scene-0107", "scene-0108", "scene-0109", "scene-0110", 20 | "scene-0120", "scene-0123", "scene-0124", "scene-0125", "scene-0126", 21 | "scene-0127", "scene-0128", "scene-0129", "scene-0130", "scene-0131", 22 | "scene-0132", "scene-0133", "scene-0134", "scene-0135", "scene-0138", 23 | "scene-0149", "scene-0150", "scene-0151", "scene-0154", "scene-0155", 24 | "scene-0157", "scene-0158", "scene-0159", "scene-0161", "scene-0162", 25 | "scene-0163", "scene-0164", "scene-0165", "scene-0166", "scene-0167", 26 | "scene-0168", "scene-0170", "scene-0171", "scene-0172", "scene-0173", 27 | "scene-0174", "scene-0175", "scene-0176", "scene-0177", "scene-0178", 28 | "scene-0179", "scene-0180", "scene-0181", "scene-0182", "scene-0183", 29 | "scene-0185", "scene-0187", "scene-0188", "scene-0190", "scene-0191", 30 | "scene-0192", "scene-0193", "scene-0194", "scene-0195", "scene-0196", 31 | "scene-0199", "scene-0200", "scene-0202", "scene-0203", "scene-0204", 32 | "scene-0206", "scene-0207", "scene-0208", "scene-0209", "scene-0210", 33 | "scene-0211", "scene-0212", "scene-0213", "scene-0214", "scene-0218", 34 | "scene-0219", "scene-0220", "scene-0221", "scene-0222", "scene-0224", 35 | "scene-0225", "scene-0226", "scene-0227", "scene-0228", "scene-0229", 36 | "scene-0230", "scene-0231", "scene-0232", "scene-0233", "scene-0234", 37 | "scene-0235", "scene-0236", "scene-0237", "scene-0238", "scene-0239", 38 | "scene-0240", "scene-0241", "scene-0242", "scene-0243", "scene-0244", 39 | "scene-0245", "scene-0246", "scene-0247", "scene-0248", "scene-0249", 40 | "scene-0250", "scene-0251", "scene-0252", "scene-0253", "scene-0254", 41 | "scene-0255", "scene-0256", "scene-0257", "scene-0258", "scene-0259", 42 | "scene-0260", "scene-0261", "scene-0262", "scene-0263", "scene-0264", 43 | "scene-0268", "scene-0270", "scene-0271", "scene-0272", "scene-0273", 44 | "scene-0274", "scene-0275", "scene-0276", "scene-0277", "scene-0278", 45 | "scene-0283", "scene-0284", "scene-0285", "scene-0286", "scene-0287", 46 | "scene-0288", "scene-0289", "scene-0290", "scene-0291", "scene-0292", 47 | "scene-0293", "scene-0294", "scene-0295", "scene-0296", "scene-0297", 48 | "scene-0298", "scene-0299", "scene-0300", "scene-0301", "scene-0302", 49 | "scene-0303", "scene-0304", "scene-0305", "scene-0306", "scene-0315", 50 | "scene-0316", "scene-0317", "scene-0318", "scene-0321", "scene-0323", 51 | "scene-0324", "scene-0328", "scene-0329", "scene-0330", "scene-0331", 52 | "scene-0332", "scene-0344", "scene-0345", "scene-0346", "scene-0349", 53 | "scene-0350", "scene-0351", "scene-0352", "scene-0353", "scene-0354", 54 | "scene-0355", "scene-0356", "scene-0357", "scene-0358", "scene-0359", 55 | "scene-0360", "scene-0361", "scene-0362", "scene-0363", "scene-0364", 56 | "scene-0365", "scene-0367", "scene-0370", "scene-0371", "scene-0372", 57 | "scene-0373", "scene-0374", "scene-0375", "scene-0376", "scene-0377", 58 | "scene-0379", "scene-0380", "scene-0381", "scene-0382", "scene-0383", 59 | "scene-0384", "scene-0385", "scene-0386", "scene-0388", "scene-0399", 60 | "scene-0400", "scene-0401", "scene-0402", "scene-0403", "scene-0405", 61 | "scene-0406", "scene-0407", "scene-0408", "scene-0420", "scene-0421", 62 | "scene-0422", "scene-0423", "scene-0424", "scene-0425", "scene-0426", 63 | "scene-0427", "scene-0428", "scene-0429", "scene-0430", "scene-0431", 64 | "scene-0432", "scene-0433", "scene-0434", "scene-0435", "scene-0436", 65 | "scene-0437", "scene-0438", "scene-0439", 
"scene-0440", "scene-0441", 66 | "scene-0442", "scene-0443", "scene-0444", "scene-0445", "scene-0446", 67 | "scene-0447", "scene-0448", "scene-0449", "scene-0450", "scene-0451", 68 | "scene-0452", "scene-0453", "scene-0454", "scene-0455", "scene-0456", 69 | "scene-0457", "scene-0458", "scene-0459", "scene-0461", "scene-0462", 70 | "scene-0463", "scene-0464", "scene-0465", "scene-0467", "scene-0468", 71 | "scene-0469", "scene-0471", "scene-0472", "scene-0474", "scene-0475", 72 | "scene-0476", "scene-0477", "scene-0478", "scene-0479", "scene-0480", 73 | "scene-0499", "scene-0500", "scene-0501", "scene-0502", "scene-0504", 74 | "scene-0505", "scene-0506", "scene-0507", "scene-0508", "scene-0509", 75 | "scene-0510", "scene-0511", "scene-0512", "scene-0513", "scene-0514", 76 | "scene-0515", "scene-0517", "scene-0518", "scene-0519", "scene-0520", 77 | "scene-0521", "scene-0522", "scene-0523", "scene-0524", "scene-0552", 78 | "scene-0553", "scene-0554", "scene-0555", "scene-0559", "scene-0560", 79 | "scene-0561", "scene-0562", "scene-0563", "scene-0564", "scene-0565", 80 | "scene-0584", "scene-0585", "scene-0586", "scene-0587", "scene-0588", 81 | "scene-0589", "scene-0590", "scene-0591", "scene-0592", "scene-0593", 82 | "scene-0594", "scene-0595", "scene-0596", "scene-0597", "scene-0598", 83 | "scene-0599", "scene-0600", "scene-0625", "scene-0626", "scene-0627", 84 | "scene-0629", "scene-0630", "scene-0632", "scene-0633", "scene-0634", 85 | "scene-0635", "scene-0636", "scene-0637", "scene-0638", "scene-0639", 86 | "scene-0640", "scene-0652", "scene-0653", "scene-0654", "scene-0655", 87 | "scene-0656", "scene-0657", "scene-0658", "scene-0659", "scene-0660", 88 | "scene-0661", "scene-0662", "scene-0663", "scene-0664", "scene-0665", 89 | "scene-0666", "scene-0667", "scene-0668", "scene-0669", "scene-0670", 90 | "scene-0671", "scene-0672", "scene-0673", "scene-0674", "scene-0675", 91 | "scene-0676", "scene-0677", "scene-0678", "scene-0679", "scene-0681", 92 | "scene-0683", "scene-0684", "scene-0685", "scene-0686", "scene-0687", 93 | "scene-0688", "scene-0689", "scene-0695", "scene-0696", "scene-0697", 94 | "scene-0698", "scene-0700", "scene-0701", "scene-0703", "scene-0704", 95 | "scene-0705", "scene-0706", "scene-0707", "scene-0708", "scene-0709", 96 | "scene-0710", "scene-0711", "scene-0712", "scene-0713", "scene-0714", 97 | "scene-0715", "scene-0716", "scene-0717", "scene-0718", "scene-0719", 98 | "scene-0726", "scene-0727", "scene-0728", "scene-0730", "scene-0731", 99 | "scene-0733", "scene-0734", "scene-0735", "scene-0736", "scene-0737", 100 | "scene-0738", "scene-0780", "scene-0781", "scene-0782", "scene-0783", 101 | "scene-0784", "scene-0786", "scene-0787", "scene-0789", "scene-0790", 102 | "scene-0791", "scene-0792", "scene-0802", "scene-0806", "scene-0808", 103 | "scene-0809", "scene-0810", "scene-0811", "scene-0812", "scene-0813", 104 | "scene-0815", "scene-0816", "scene-0817", "scene-0819", "scene-0820", 105 | "scene-0821", "scene-0822", "scene-0847", "scene-0848", "scene-0849", 106 | "scene-0850", "scene-0851", "scene-0852", "scene-0853", "scene-0854", 107 | "scene-0855", "scene-0856", "scene-0858", "scene-0860", "scene-0861", 108 | "scene-0862", "scene-0863", "scene-0864", "scene-0865", "scene-0866", 109 | "scene-0868", "scene-0869", "scene-0870", "scene-0871", "scene-0872", 110 | "scene-0873", "scene-0875", "scene-0876", "scene-0877", "scene-0878", 111 | "scene-0880", "scene-0882", "scene-0883", "scene-0884", "scene-0885", 112 | "scene-0886", "scene-0887", "scene-0888", "scene-0889", 
"scene-0890", 113 | "scene-0891", "scene-0892", "scene-0893", "scene-0894", "scene-0895", 114 | "scene-0896", "scene-0897", "scene-0898", "scene-0899", "scene-0900", 115 | "scene-0901", "scene-0902", "scene-0903", "scene-0904", "scene-0905", 116 | "scene-0906", "scene-0907", "scene-0908", "scene-0909", "scene-0916", 117 | "scene-0917", "scene-0921", "scene-0922", "scene-0923", "scene-0925", 118 | "scene-0926", "scene-0927", "scene-0928", "scene-0929", "scene-0930", 119 | "scene-0931", "scene-0945", "scene-0947", "scene-0949", "scene-0952", 120 | "scene-0953", "scene-0955", "scene-0956", "scene-0957", "scene-0958", 121 | "scene-0959", "scene-0960", "scene-0961", "scene-0966", "scene-0967", 122 | "scene-0968", "scene-0969", "scene-0971", "scene-0972", "scene-0975", 123 | "scene-0976", "scene-0977", "scene-0978", "scene-0979", "scene-0980", 124 | "scene-0981", "scene-0982", "scene-0983", "scene-0984", "scene-0988", 125 | "scene-0989", "scene-0990", "scene-0991", "scene-0992", "scene-0994", 126 | "scene-0995", "scene-0996", "scene-0997", "scene-0998", "scene-0999", 127 | "scene-1000", "scene-1001", "scene-1004", "scene-1005", "scene-1006", 128 | "scene-1007", "scene-1008", "scene-1009", "scene-1010", "scene-1011", 129 | "scene-1012", "scene-1013", "scene-1014", "scene-1015", "scene-1019", 130 | "scene-1020", "scene-1021", "scene-1022", "scene-1023", "scene-1024", 131 | "scene-1025", "scene-1044", "scene-1045", "scene-1046", "scene-1047", 132 | "scene-1048", "scene-1049", "scene-1050", "scene-1051", "scene-1052", 133 | "scene-1053", "scene-1054", "scene-1064", "scene-1065", "scene-1066", 134 | "scene-1067", "scene-1068", "scene-1069", "scene-1070", "scene-1071", 135 | "scene-1072", "scene-1073", "scene-1074", "scene-1075", "scene-1076", 136 | "scene-1077", "scene-1078", "scene-1079", "scene-1080", "scene-1081", 137 | "scene-1082", "scene-1083", "scene-1084", "scene-1085", "scene-1086", 138 | "scene-1087", "scene-1088", "scene-1089", "scene-1090", "scene-1091", 139 | "scene-1092", "scene-1093", "scene-1094", "scene-1095", "scene-1096", 140 | "scene-1097", "scene-1098", "scene-1099", "scene-1100", "scene-1101", 141 | "scene-1102", "scene-1104", "scene-1105", "scene-1106", "scene-1107", 142 | "scene-1108", "scene-1109", "scene-1110"] 143 | 144 | VAL_SCENES = [ 145 | "scene-0001", "scene-0010", "scene-0011", "scene-0020", "scene-0038", 146 | "scene-0041", "scene-0053", "scene-0054", "scene-0121", "scene-0122", 147 | "scene-0139", "scene-0152", "scene-0160", "scene-0184", "scene-0269", 148 | "scene-0347", "scene-0348", "scene-0366", "scene-0368", "scene-0369", 149 | "scene-0378", "scene-0389", "scene-0390", "scene-0391", "scene-0392", 150 | "scene-0393", "scene-0394", "scene-0395", "scene-0396", "scene-0397", 151 | "scene-0398", "scene-0411", "scene-0412", "scene-0413", "scene-0414", 152 | "scene-0415", "scene-0416", "scene-0417", "scene-0418", "scene-0419", 153 | "scene-0525", "scene-0526", "scene-0527", "scene-0528", "scene-0529", 154 | "scene-0530", "scene-0531", "scene-0532", "scene-0533", "scene-0534", 155 | "scene-0535", "scene-0536", "scene-0537", "scene-0538", "scene-0539", 156 | "scene-0541", "scene-0542", "scene-0543", "scene-0544", "scene-0545", 157 | "scene-0546", "scene-0556", "scene-0557", "scene-0558", "scene-0566", 158 | "scene-0568", "scene-0570", "scene-0571", "scene-0572", "scene-0573", 159 | "scene-0574", "scene-0575", "scene-0576", "scene-0577", "scene-0578", 160 | "scene-0580", "scene-0582", "scene-0583", "scene-0642", "scene-0643", 161 | "scene-0644", "scene-0645", 
"scene-0646", "scene-0647", "scene-0648", 162 | "scene-0649", "scene-0650", "scene-0651", "scene-0739", "scene-0740", 163 | "scene-0741", "scene-0744", "scene-0746", "scene-0747", "scene-0749", 164 | "scene-0750", "scene-0751", "scene-0752", "scene-0757", "scene-0758", 165 | "scene-0759", "scene-0760", "scene-0761", "scene-0762", "scene-0763", 166 | "scene-0764", "scene-0765", "scene-0767", "scene-0768", "scene-0769", 167 | "scene-0770", "scene-0771", "scene-0775", "scene-0777", "scene-0778", 168 | "scene-0794", "scene-0795", "scene-0796", "scene-0797", "scene-0798", 169 | "scene-0799", "scene-0800", "scene-0803", "scene-0804", "scene-0911", 170 | "scene-0912", "scene-0913", "scene-0914", "scene-0915", "scene-0919", 171 | "scene-0920", "scene-0924", "scene-0962", "scene-0963", "scene-1002", 172 | "scene-1003", "scene-1016", "scene-1017", "scene-1018", "scene-1055", 173 | "scene-1056", "scene-1057", "scene-1058", "scene-1059", "scene-1060", 174 | "scene-1061", "scene-1062", "scene-1063"] 175 | 176 | 177 | CALIBRATION_SCENES = [ 178 | "scene-0852", "scene-0429", "scene-0956", "scene-0194", "scene-0811", 179 | "scene-1110", "scene-1107", "scene-0294", "scene-0900", "scene-0596", 180 | "scene-0296", "scene-0885", "scene-0866", "scene-0105", "scene-0782", 181 | "scene-0191", "scene-0876", "scene-0133", "scene-0231", "scene-0847", 182 | "scene-0363", "scene-0026", "scene-0791", "scene-0909", "scene-0002", 183 | "scene-0283", "scene-0007", "scene-0251", "scene-1100", "scene-0668", 184 | "scene-0584", "scene-0287", "scene-0260", "scene-0171", "scene-0789", 185 | "scene-0108", "scene-0190", "scene-0206", "scene-0635", "scene-0815", 186 | "scene-0058", "scene-0710", "scene-0302", "scene-0639", "scene-0166", 187 | "scene-0094", "scene-0735", "scene-0321", "scene-1091", "scene-0344" 188 | ] -------------------------------------------------------------------------------- /src/data/nuscenes/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from shapely import geometry, affinity 4 | from pyquaternion import Quaternion 5 | 6 | from nuscenes.eval.detection.utils import category_to_detection_name 7 | from nuscenes.eval.detection.constants import DETECTION_NAMES 8 | from nuscenes.utils.data_classes import LidarPointCloud 9 | 10 | from ..utils import transform_polygon, render_polygon, transform 11 | 12 | CAMERA_NAMES = ['CAM_FRONT', 'CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT', 13 | 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT', 'CAM_BACK'] 14 | 15 | NUSCENES_CLASS_NAMES = [ 16 | 'drivable_area', 'ped_crossing', 'walkway', 'carpark', 'car', 'truck', 17 | 'bus', 'trailer', 'construction_vehicle', 'pedestrian', 'motorcycle', 18 | 'bicycle', 'traffic_cone', 'barrier' 19 | ] 20 | 21 | STATIC_CLASSES = ['drivable_area', 'ped_crossing', 'walkway', 'carpark_area'] 22 | 23 | LOCATIONS = ['boston-seaport', 'singapore-onenorth', 'singapore-queenstown', 24 | 'singapore-hollandvillage'] 25 | 26 | 27 | def iterate_samples(nuscenes, start_token): 28 | sample_token = start_token 29 | while sample_token != '': 30 | sample = nuscenes.get('sample', sample_token) 31 | yield sample 32 | sample_token = sample['next'] 33 | 34 | 35 | def get_map_masks(nuscenes, map_data, sample_data, extents, resolution): 36 | 37 | # Render each layer sequentially 38 | layers = [get_layer_mask(nuscenes, polys, sample_data, extents, 39 | resolution) for layer, polys in map_data.items()] 40 | 41 | return np.stack(layers, axis=0) 42 | 43 | 44 | def get_layer_mask(nuscenes, polygons, 
sample_data, extents, resolution): 45 | 46 | # Get the 2D affine transform from bev coords to map coords 47 | tfm = get_sensor_transform(nuscenes, sample_data)[[0, 1, 3]][:, [0, 2, 3]] 48 | inv_tfm = np.linalg.inv(tfm) 49 | 50 | # Create a patch representing the birds-eye-view region in map coordinates 51 | map_patch = geometry.box(*extents) 52 | map_patch = transform_polygon(map_patch, tfm) 53 | 54 | # Initialise the map mask 55 | x1, z1, x2, z2 = extents 56 | mask = np.zeros((int((z2 - z1) / resolution), int((x2 - x1) / resolution)), 57 | dtype=np.uint8) 58 | 59 | # Find all polygons which intersect with the area of interest 60 | for polygon in polygons.query(map_patch): 61 | 62 | polygon = polygon.intersection(map_patch) 63 | 64 | # Transform into map coordinates 65 | polygon = transform_polygon(polygon, inv_tfm) 66 | 67 | # Render the polygon to the mask 68 | render_shapely_polygon(mask, polygon, extents, resolution) 69 | 70 | return mask.astype(np.bool) 71 | 72 | 73 | 74 | 75 | def get_object_masks(nuscenes, sample_data, extents, resolution): 76 | 77 | # Initialize object masks 78 | nclass = len(DETECTION_NAMES) + 1 79 | grid_width = int((extents[2] - extents[0]) / resolution) 80 | grid_height = int((extents[3] - extents[1]) / resolution) 81 | masks = np.zeros((nclass, grid_height, grid_width), dtype=np.uint8) 82 | 83 | # Get the 2D affine transform from bev coords to map coords 84 | tfm = get_sensor_transform(nuscenes, sample_data)[[0, 1, 3]][:, [0, 2, 3]] 85 | inv_tfm = np.linalg.inv(tfm) 86 | 87 | for box in nuscenes.get_boxes(sample_data['token']): 88 | 89 | # Get the index of the class 90 | det_name = category_to_detection_name(box.name) 91 | if det_name not in DETECTION_NAMES: 92 | class_id = -1 93 | else: 94 | class_id = DETECTION_NAMES.index(det_name) 95 | 96 | # Get bounding box coordinates in the grid coordinate frame 97 | bbox = box.bottom_corners()[:2] 98 | local_bbox = np.dot(inv_tfm[:2, :2], bbox).T + inv_tfm[:2, 2] 99 | 100 | # Render the rotated bounding box to the mask 101 | render_polygon(masks[class_id], local_bbox, extents, resolution) 102 | 103 | return masks.astype(np.bool) 104 | 105 | 106 | def get_sensor_transform(nuscenes, sample_data): 107 | 108 | # Load sensor transform data 109 | sensor = nuscenes.get( 110 | 'calibrated_sensor', sample_data['calibrated_sensor_token']) 111 | sensor_tfm = make_transform_matrix(sensor) 112 | 113 | # Load ego pose data 114 | pose = nuscenes.get('ego_pose', sample_data['ego_pose_token']) 115 | pose_tfm = make_transform_matrix(pose) 116 | 117 | return np.dot(pose_tfm, sensor_tfm) 118 | 119 | 120 | def load_point_cloud(nuscenes, sample_data): 121 | 122 | # Load point cloud 123 | lidar_path = os.path.join(nuscenes.dataroot, sample_data['filename']) 124 | pcl = LidarPointCloud.from_file(lidar_path) 125 | return pcl.points[:3, :].T 126 | 127 | 128 | def make_transform_matrix(record): 129 | """ 130 | Create a 4x4 transform matrix from a calibrated_sensor or ego_pose record 131 | """ 132 | transform = np.eye(4) 133 | transform[:3, :3] = Quaternion(record['rotation']).rotation_matrix 134 | transform[:3, 3] = np.array(record['translation']) 135 | return transform 136 | 137 | 138 | def render_shapely_polygon(mask, polygon, extents, resolution): 139 | 140 | if polygon.geom_type == 'Polygon': 141 | 142 | # Render exteriors 143 | render_polygon(mask, polygon.exterior.coords, extents, resolution, 1) 144 | 145 | # Render interiors 146 | for hole in polygon.interiors: 147 | render_polygon(mask, hole.coords, extents, resolution, 0) 148 | 149 | # 
Handle the case of compound shapes 150 | else: 151 | for poly in polygon: 152 | render_shapely_polygon(mask, poly, extents, resolution) 153 | 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /src/data/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import torch 4 | from shapely import affinity 5 | 6 | def decode_binary_labels(labels, nclass): 7 | bits = torch.pow(2, torch.arange(nclass)) 8 | return (labels & bits.view(-1, 1, 1)) > 0 9 | 10 | 11 | def encode_binary_labels(masks): 12 | bits = np.power(2, np.arange(len(masks), dtype=np.int32)) 13 | return (masks.astype(np.int32) * bits.reshape(-1, 1, 1)).sum(0) 14 | 15 | 16 | def transform(matrix, vectors): 17 | vectors = np.dot(matrix[:-1, :-1], vectors.T) 18 | vectors = vectors.T + matrix[:-1, -1] 19 | return vectors 20 | 21 | 22 | def transform_polygon(polygon, affine): 23 | """ 24 | Transform a 2D polygon 25 | """ 26 | a, b, tx, c, d, ty = affine.flatten()[:6] 27 | return affinity.affine_transform(polygon, [a, b, c, d, tx, ty]) 28 | 29 | 30 | def render_polygon(mask, polygon, extents, resolution, value=1): 31 | if len(polygon) == 0: 32 | return 33 | polygon = (polygon - np.array(extents[:2])) / resolution 34 | polygon = np.ascontiguousarray(polygon).round().astype(np.int32) 35 | cv2.fillConvexPoly(mask, polygon, value) 36 | 37 | 38 | def get_visible_mask(instrinsics, image_width, extents, resolution): 39 | 40 | # Get calibration parameters 41 | fu, cu = instrinsics[0, 0], instrinsics[0, 2] 42 | 43 | # Construct a grid of image coordinates 44 | x1, z1, x2, z2 = extents 45 | x, z = np.arange(x1, x2, resolution), np.arange(z1, z2, resolution) 46 | ucoords = x / z[:, None] * fu + cu 47 | 48 | # Return all points which lie within the camera bounds 49 | return (ucoords >= 0) & (ucoords < image_width) 50 | 51 | 52 | def get_occlusion_mask(points, extents, resolution): 53 | 54 | x1, z1, x2, z2 = extents 55 | 56 | # A 'ray' is defined by the ratio between x and z coordinates 57 | ray_width = resolution / z2 58 | ray_offset = x1 / ray_width 59 | max_rays = int((x2 - x1) / ray_width) 60 | 61 | # Group LiDAR points into bins 62 | rayid = np.round(points[:, 0] / points[:, 2] / ray_width - ray_offset) 63 | depth = points[:, 2] 64 | 65 | # Ignore rays which do not correspond to any grid cells in the BEV 66 | valid = (rayid > 0) & (rayid < max_rays) & (depth > 0) 67 | rayid = rayid[valid] 68 | depth = depth[valid] 69 | 70 | # Find the LiDAR point with maximum depth within each bin 71 | max_depth = np.zeros((max_rays,)) 72 | np.maximum.at(max_depth, rayid.astype(np.int32), depth) 73 | 74 | # For each bev grid point, sample the max depth along the corresponding ray 75 | x = np.arange(x1, x2, resolution) 76 | z = np.arange(z1, z2, resolution)[:, None] 77 | grid_rayid = np.round(x / z / ray_width - ray_offset).astype(np.int32) 78 | grid_max_depth = max_depth[grid_rayid] 79 | 80 | # A grid position is considered occluded if the there are no LiDAR points 81 | # passing through it 82 | occluded = grid_max_depth < z 83 | return occluded 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tom-roddick/mono-semantic-maps/f6b0f52857a4e622ae42bf24faaf9681cc089f76/src/models/__init__.py -------------------------------------------------------------------------------- /src/models/criterion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from ..nn.losses import balanced_binary_cross_entropy, uncertainty_loss, \ 4 | kl_divergence_loss, focal_loss, prior_offset_loss, prior_uncertainty_loss 5 | 6 | # class OccupancyCriterion(nn.Module): 7 | 8 | # def __init__(self, xent_weight=1., uncert_weight=0., class_weights=None): 9 | # super().__init__() 10 | 11 | # self.xent_weight = xent_weight 12 | # self.uncert_weight = uncert_weight 13 | 14 | # if class_weights is None: 15 | # self.class_weights = torch.ones(1) 16 | # else: 17 | # self.class_weights = torch.tensor(class_weights) 18 | 19 | 20 | # def forward(self, logits, labels, mask, *args): 21 | 22 | # # Compute binary cross entropy loss 23 | # self.class_weights = self.class_weights.to(logits) 24 | # bce_loss = balanced_binary_cross_entropy( 25 | # logits, labels, mask, self.class_weights) 26 | 27 | # # Compute uncertainty loss for unknown image regions 28 | # uncert_loss = uncertainty_loss(logits, mask) 29 | 30 | # return bce_loss * self.xent_weight + uncert_loss * self.uncert_weight 31 | 32 | 33 | class OccupancyCriterion(nn.Module): 34 | 35 | def __init__(self, priors, xent_weight=1., uncert_weight=0., 36 | weight_mode='sqrt_inverse'): 37 | super().__init__() 38 | 39 | self.xent_weight = xent_weight 40 | self.uncert_weight = uncert_weight 41 | 42 | self.priors = torch.tensor(priors) 43 | 44 | if weight_mode == 'inverse': 45 | self.class_weights = 1 / self.priors 46 | elif weight_mode == 'sqrt_inverse': 47 | self.class_weights = torch.sqrt(1 / self.priors) 48 | elif weight_mode == 'equal': 49 | self.class_weights = torch.ones_like(self.priors) 50 | else: 51 | raise ValueError('Unknown weight mode option: ' + weight_mode) 52 | 53 | 54 | def forward(self, logits, labels, mask, *args): 55 | 56 | # Compute binary cross entropy loss 57 | self.class_weights = self.class_weights.to(logits) 58 | bce_loss = balanced_binary_cross_entropy( 59 | logits, labels, mask, self.class_weights) 60 | 61 | # Compute uncertainty loss for unknown image regions 62 | self.priors = self.priors.to(logits) 63 | uncert_loss = prior_uncertainty_loss(logits, mask, self.priors) 64 | 65 | return bce_loss * self.xent_weight + uncert_loss * self.uncert_weight 66 | 67 | 68 | 69 | class FocalLossCriterion(nn.Module): 70 | 71 | def __init__(self, alpha, gamma): 72 | super().__init__() 73 | self.alpha = alpha 74 | self.gamma = gamma 75 | 76 | def forward(self, logits, labels, mask, *args): 77 | return focal_loss(logits, labels, mask, self.alpha, self.gamma) 78 | 79 | 80 | class PriorOffsetCriterion(nn.Module): 81 | 82 | def __init__(self, priors): 83 | super().__init__() 84 | self.priors = priors 85 | 86 | def forward(self, logits, labels, mask, *args): 87 | return prior_offset_loss(logits, labels, mask, self.priors) 88 | 89 | 90 | 91 | 92 | class VaeOccupancyCriterion(OccupancyCriterion): 93 | 94 | def __init__(self, priors, xent_weight=0.9, uncert_weight=0., weight_mode='sqrt_inverse', kld_weight=0.1): 95 | super().__init__(priors, xent_weight, uncert_weight, weight_mode) 96 | 97 | self.kld_weight = kld_weight 98 | 99 | def forward(self, logits, labels, mask, mu, logvar): 100 | 101 | kld_loss = kl_divergence_loss(mu, logvar) 102 | occ_loss = 
super().forward(logits, labels, mask) 103 | return occ_loss + kld_loss * self.kld_weight 104 | -------------------------------------------------------------------------------- /src/models/model_factory.py: -------------------------------------------------------------------------------- 1 | import math 2 | from operator import mul 3 | from functools import reduce 4 | import torch.nn as nn 5 | 6 | from .pyramid import PyramidOccupancyNetwork 7 | from .ved import VariationalEncoderDecoder 8 | from .vpn import VPNModel 9 | from .criterion import OccupancyCriterion, VaeOccupancyCriterion, \ 10 | FocalLossCriterion, PriorOffsetCriterion 11 | 12 | from ..nn.fpn import FPN50 13 | from ..nn.topdown import TopdownNetwork 14 | from ..nn.pyramid import TransformerPyramid 15 | from ..nn.classifier import LinearClassifier, BayesianClassifier 16 | 17 | 18 | 19 | def build_model(model_name, config): 20 | 21 | if model_name == 'pyramid': 22 | model = build_pyramid_occupancy_network(config) 23 | elif model_name == 'ved': 24 | model = build_variational_encoder_decoder(config) 25 | elif model_name == 'vpn': 26 | model = build_view_parsing_network(config) 27 | else: 28 | raise ValueError("Unknown model name '{}'".format(model_name)) 29 | 30 | if len(config.gpus) > 1: 31 | model = nn.DataParallel(model.cuda(), config.gpus) 32 | elif len(config.gpus) == 1: 33 | model.cuda() 34 | 35 | return model 36 | 37 | 38 | def build_criterion(model_name, config): 39 | 40 | if model_name == 'ved': 41 | criterion = VaeOccupancyCriterion(config.prior, 42 | config.xent_weight, 43 | config.uncert_weight, 44 | config.weight_mode, 45 | config.kld_weight, 46 | ) 47 | 48 | elif config.loss_fn == 'focal': 49 | criterion = FocalLossCriterion(config.focal.alpha, config.focal.gamma) 50 | elif config.loss_fn == 'prior': 51 | criterion = PriorOffsetCriterion(config.prior) 52 | else: 53 | criterion = OccupancyCriterion(config.prior, config.xent_weight, 54 | config.uncert_weight, config.weight_mode) 55 | 56 | if len(config.gpus) > 0: 57 | criterion.cuda() 58 | 59 | return criterion 60 | 61 | 62 | 63 | def build_pyramid_occupancy_network(config): 64 | 65 | # Build frontend 66 | frontend = FPN50() 67 | 68 | # Build transformer pyramid 69 | tfm_resolution = config.map_resolution * reduce(mul, config.topdown.strides) 70 | transformer = TransformerPyramid(256, config.tfm_channels, tfm_resolution, 71 | config.map_extents, config.ymin, 72 | config.ymax, config.focal_length) 73 | 74 | # Build topdown network 75 | topdown = TopdownNetwork(config.tfm_channels, config.topdown.channels, 76 | config.topdown.layers, config.topdown.strides, 77 | config.topdown.blocktype) 78 | 79 | # Build classifier 80 | if config.bayesian: 81 | classifier = BayesianClassifier(topdown.out_channels, config.num_class) 82 | else: 83 | classifier = LinearClassifier(topdown.out_channels, config.num_class) 84 | classifier.initialise(config.prior) 85 | 86 | # Assemble Pyramid Occupancy Network 87 | return PyramidOccupancyNetwork(frontend, transformer, topdown, classifier) 88 | 89 | 90 | 91 | def build_variational_encoder_decoder(config): 92 | 93 | return VariationalEncoderDecoder(config.num_class, 94 | config.ved.bottleneck_dim, 95 | config.map_extents, 96 | config.map_resolution) 97 | 98 | 99 | def build_view_parsing_network(config): 100 | 101 | return VPNModel(1, config.num_class, config.vpn.output_size, 102 | config.vpn.fc_dim, config.map_extents, 103 | config.map_resolution) 104 | 105 | 106 | -------------------------------------------------------------------------------- 
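A minimal usage sketch of the two factory functions above (not part of the repository): the particular combination of config files, and the use of yacs' merge_from_other_cfg, are assumptions for illustration, and the snippet presumes the merged CfgNode ends up holding every key the builders read (model, prior, map_resolution, topdown.*, etc.).

    from src.utils.configs import get_default_configuration, load_config
    from src.models.model_factory import build_model, build_criterion

    # Start from the defaults and overlay a dataset file and a model file.
    # merge_from_other_cfg is the standard yacs merge; it assumes the default
    # config already declares the keys that the overlay files override.
    config = get_default_configuration()
    config.merge_from_other_cfg(load_config('configs/datasets/nuscenes.yml'))
    config.merge_from_other_cfg(load_config('configs/models/pyramid.yml'))

    # PyramidOccupancyNetwork, wrapped in nn.DataParallel when several GPUs
    # are configured, and an OccupancyCriterion built from the class priors.
    model = build_model('pyramid', config)
    criterion = build_criterion('pyramid', config)
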
/src/models/pyramid.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class PyramidOccupancyNetwork(nn.Module): 8 | 9 | 10 | def __init__(self, frontend, transformer, topdown, classifier): 11 | super().__init__() 12 | 13 | 14 | self.frontend = frontend 15 | self.transformer = transformer 16 | self.topdown = topdown 17 | self.classifier = classifier 18 | 19 | 20 | def forward(self, image, calib, *args): 21 | 22 | # Extract multiscale feature maps 23 | feature_maps = self.frontend(image) 24 | 25 | # Transform image features to birds-eye-view 26 | bev_feats = self.transformer(feature_maps, calib) 27 | 28 | # Apply topdown network 29 | td_feats = self.topdown(bev_feats) 30 | 31 | # Predict individual class log-probabilities 32 | logits = self.classifier(td_feats) 33 | return logits -------------------------------------------------------------------------------- /src/models/ved.py: -------------------------------------------------------------------------------- 1 | """ 2 | This implementation of the model from the paper "Monocular Semantic Occupancy 3 | Grid Mapping with Convolutional Variational Encoder-Decoder Networks" is 4 | directly adapted from the code provided by the original authors at 5 | https://github.com/Chenyang-Lu/mono-semantic-occupancy (accessed 08/06/2020). 6 | 7 | Modifications to the original code are identified in comments. 8 | 9 | MIT License 10 | 11 | Copyright (c) 2019 12 | 13 | Permission is hereby granted, free of charge, to any person obtaining a copy 14 | of this software and associated documentation files (the "Software"), to deal 15 | in the Software without restriction, including without limitation the rights 16 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 17 | copies of the Software, and to permit persons to whom the Software is 18 | furnished to do so, subject to the following conditions: 19 | 20 | The above copyright notice and this permission notice shall be included in all 21 | copies or substantial portions of the Software. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 24 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 25 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 26 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 27 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 28 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 29 | SOFTWARE. 
30 | """ 31 | 32 | import numpy as np 33 | import torch 34 | import torch.nn as nn 35 | import torch.nn.functional as F 36 | import torchvision.models as models 37 | 38 | from ..nn import losses 39 | 40 | 41 | class VariationalEncoderDecoder(nn.Module): 42 | 43 | def __init__(self, num_class, bottleneck_dim, map_extents, map_resolution): 44 | 45 | super().__init__() 46 | self.model = VaeMapping(num_class, bottleneck_dim) 47 | self.output_size = ( 48 | int((map_extents[3] - map_extents[1]) / map_resolution), 49 | int((map_extents[2] - map_extents[0]) / map_resolution), 50 | ) 51 | 52 | 53 | def forward(self, image, *args): 54 | 55 | # Downsample input image so that it more closely matches 56 | # the input dimensions used in the original paper 57 | image = image[:, :, ::2, ::2] 58 | 59 | # Run model forwards 60 | logits, mu, logvar = self.model(image, self.output_size, self.training) 61 | 62 | return logits, mu, logvar 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | def get_upsampling_weight(in_channels, out_channels, kernel_size): 71 | """Make a 2D bilinear kernel suitable for upsampling""" 72 | factor = (kernel_size + 1) // 2 73 | if kernel_size % 2 == 1: 74 | center = factor - 1 75 | else: 76 | center = factor - 0.5 77 | og = np.ogrid[:kernel_size, :kernel_size] 78 | filt = (1 - abs(og[0] - center) / factor) * \ 79 | (1 - abs(og[1] - center) / factor) 80 | weight = np.zeros((in_channels, out_channels, kernel_size, kernel_size), 81 | dtype=np.float64) 82 | weight[range(in_channels), range(out_channels), :, :] = filt 83 | return torch.from_numpy(weight).float() 84 | 85 | 86 | class upsample(nn.Module): 87 | 88 | def __init__(self, if_deconv, channels=None): 89 | super(upsample, self).__init__() 90 | if if_deconv: 91 | self.upsample = nn.ConvTranspose2d( 92 | channels, channels, 4, stride=2, padding=1, bias=False) 93 | else: 94 | self.upsample = nn.Upsample( 95 | scale_factor=2, mode='bilinear', align_corners=True) 96 | 97 | def forward(self, x): 98 | x = self.upsample(x) 99 | 100 | return x 101 | 102 | 103 | class double_conv(nn.Module): 104 | 105 | def __init__(self, in_ch, out_ch): 106 | super(double_conv, self).__init__() 107 | 108 | self.conv = nn.Sequential( 109 | nn.Conv2d(in_ch, out_ch, 3, padding=1), 110 | nn.BatchNorm2d(out_ch), 111 | nn.ReLU(), 112 | nn.Conv2d(out_ch, out_ch, 3, padding=1), 113 | nn.BatchNorm2d(out_ch), 114 | nn.ReLU() 115 | ) 116 | 117 | def forward(self, x): 118 | x = self.conv(x) 119 | return x 120 | 121 | 122 | class encoder_after_vgg(nn.Module): 123 | 124 | def __init__(self, bottleneck_dim=32): 125 | super(encoder_after_vgg, self).__init__() 126 | 127 | self.conv = nn.Sequential( 128 | nn.Conv2d(512, 256, 3, padding=1), 129 | nn.BatchNorm2d(256), 130 | nn.ReLU(), 131 | nn.Conv2d(256, 128, 3, padding=1), 132 | nn.BatchNorm2d(128), 133 | nn.ReLU(), 134 | nn.MaxPool2d(2) 135 | ) 136 | 137 | # MODIFIED: The original VED paper assumed fixed input dimensions of 138 | # 256x512, which leads to a bottleneck dimension of 8x4. Since our 139 | # input size varies depending on dataset we have to specify the 140 | # bottleneck dimension manually. 
141 | self.mu_dec = nn.Linear(bottleneck_dim * 128, 512) 142 | self.logvar_dec = nn.Linear(bottleneck_dim * 128, 512) 143 | 144 | 145 | def forward(self, x): 146 | x = self.conv(x) 147 | x = x.flatten(1, 3) 148 | mu = self.mu_dec(x) 149 | logvar = self.logvar_dec(x) 150 | 151 | return mu, logvar 152 | 153 | 154 | class decoder_conv(nn.Module): 155 | def __init__(self, num_class, if_deconv=True): 156 | super(decoder_conv, self).__init__() 157 | 158 | self.up1 = upsample(if_deconv=if_deconv, channels=128) 159 | self.conv1 = double_conv(128, 256) 160 | self.up2 = upsample(if_deconv=if_deconv, channels=256) 161 | self.conv2 = double_conv(256, 256) 162 | self.up3 = upsample(if_deconv=if_deconv, channels=256) 163 | self.conv3 = double_conv(256, 256) 164 | self.up4 = upsample(if_deconv=if_deconv, channels=256) 165 | self.conv4 = double_conv(256, 256) 166 | self.up5 = upsample(if_deconv=if_deconv, channels=256) 167 | self.conv5 = double_conv(256, 256) 168 | 169 | # MODIFIED: Add an additional upsampling layer 170 | self.up6 = upsample(if_deconv=if_deconv, channels=256) 171 | self.conv6 = double_conv(256, 256) 172 | 173 | self.conv_out = nn.Conv2d(256, num_class, 3, padding=1) 174 | 175 | self._initialize_weights() 176 | 177 | def _initialize_weights(self): 178 | for m in self.modules(): 179 | if isinstance(m, nn.ConvTranspose2d): 180 | assert m.kernel_size[0] == m.kernel_size[1] 181 | initial_weight = get_upsampling_weight( 182 | m.in_channels, m.out_channels, m.kernel_size[0]) 183 | m.weight.data.copy_(initial_weight) 184 | 185 | def forward(self, x, output_size): 186 | x = x.view(-1, 128, 2, 2) 187 | x = self.up1(x) 188 | x = self.conv1(x) 189 | 190 | x = self.up2(x) 191 | x = self.conv2(x) 192 | 193 | x = self.up3(x) 194 | x = self.conv3(x) 195 | 196 | x = self.up4(x) 197 | x = self.conv4(x) 198 | 199 | x = self.up5(x) 200 | x = self.conv5(x) 201 | 202 | # MODIFIED: Add additional upsampling layer 203 | x = self.up6(x) 204 | x = self.conv6(x) 205 | 206 | # MODIFIED: Resample to match label dimensions 207 | x = F.upsample(x, size=output_size, mode='bilinear') 208 | 209 | x = self.conv_out(x) 210 | 211 | return x 212 | 213 | 214 | class VaeMapping(nn.Module): 215 | 216 | def __init__(self, num_class, bottleneck_dim=32): 217 | super(VaeMapping, self).__init__() 218 | 219 | self.vgg16 = models.vgg16_bn(pretrained=True) 220 | self.vgg16_feature = nn.Sequential(*list(self.vgg16.features.children())[:]) 221 | self.encoder_afterv_vgg = encoder_after_vgg(bottleneck_dim) 222 | self.decoder = decoder_conv(num_class, if_deconv=True) 223 | 224 | def reparameterize(self, is_training, mu, logvar): 225 | if is_training: 226 | std = torch.exp(0.5*logvar) 227 | eps = torch.randn_like(std) 228 | return eps.mul(std).add_(mu) 229 | else: 230 | return mu 231 | 232 | def forward(self, x, output_size, is_training=False, defined_mu=None): 233 | 234 | x = self.vgg16_feature(x) 235 | mu, logvar = self.encoder_afterv_vgg(x) 236 | z = self.reparameterize(is_training, mu, logvar) 237 | if defined_mu is not None: 238 | z = defined_mu 239 | pred_map = self.decoder(z, output_size) 240 | 241 | return pred_map, mu, logvar 242 | 243 | 244 | def loss_function_map(pred_map, map, mu, logvar): 245 | 246 | # MODIFIED: move weights to same GPU as inputs 247 | CE = F.cross_entropy(pred_map, map.view(-1, 64, 64), weight= 248 | torch.Tensor([0.6225708, 2.53963754, 15.46416047, 0.52885405]).to(map), ignore_index=4) 249 | KLD = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp()) 250 | 251 | return 0.9*CE + 0.1*KLD, CE, KLD 252 | 253 | 
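The MODIFIED comment in encoder_after_vgg above explains why the bottleneck size must be supplied manually: the image is halved in VariationalEncoderDecoder.forward, reduced by a further factor of 32 by the VGG16-bn feature extractor, and halved once more by the MaxPool2d(2) in encoder_after_vgg, so bottleneck_dim is just the number of spatial cells that survive an overall downsampling of 128. A sketch of the arithmetic for a hypothetical 600 x 960 input (the numbers below are illustrative, not taken from any particular config):

    # 600 x 960 image   --[::2 slice in forward]-->   300 x 480
    # 300 x 480         --[VGG16-bn features, /32]-->   9 x  15
    #   9 x  15         --[extra MaxPool2d(2)]-->        4 x   7
    # bottleneck_dim = 4 * 7 = 28, so mu_dec and logvar_dec become
    # nn.Linear(28 * 128, 512) layers acting on the 3584-dim flattened feature.
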
-------------------------------------------------------------------------------- /src/models/vpn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This implementation of the model from the paper "Cross-view Semantic 3 | Segmentation for Sensing Surroundings" is directly adapted from the code 4 | provided by the original authors at 5 | https://github.com/pbw-Berwin/View-Parsing-Network (accessed 08/06/2020) 6 | 7 | """ 8 | 9 | 10 | # File : models.py 11 | # Author : Bowen Pan 12 | # Email : panbowen0607@gmail.com 13 | # Date : 09/18/2018 14 | # 15 | # Distributed under terms of the MIT license. 16 | import os 17 | import sys 18 | import math 19 | import torch 20 | from torch import nn 21 | from collections import OrderedDict 22 | import torch.nn.functional as F 23 | 24 | import numpy as np 25 | from itertools import combinations 26 | 27 | from torchvision.models import resnet 28 | 29 | try: 30 | from urllib import urlretrieve 31 | except ImportError: 32 | from urllib.request import urlretrieve 33 | 34 | 35 | class TransformModule(nn.Module): 36 | def __init__(self, dim=(37, 60), num_view=8): 37 | super(TransformModule, self).__init__() 38 | self.num_view = num_view 39 | self.dim = dim 40 | self.mat_list = nn.ModuleList() 41 | 42 | # MODIFIED: dims need not be square 43 | for i in range(self.num_view): 44 | fc_transform = nn.Sequential( 45 | nn.Linear(dim[0] * dim[1], dim[0] * dim[1]), 46 | nn.ReLU(), 47 | nn.Linear(dim[0] * dim[1], dim[0] * dim[1]), 48 | nn.ReLU() 49 | ) 50 | self.mat_list += [fc_transform] 51 | 52 | def forward(self, x): 53 | # shape x: B, V, C, H, W 54 | x = x.view(list(x.size()[:3]) + [self.dim[0] * self.dim[1],]) 55 | view_comb = self.mat_list[0](x[:, 0]) 56 | for index in range(x.size(1))[1:]: 57 | view_comb += self.mat_list[index](x[:, index]) 58 | view_comb = view_comb.view(list(view_comb.size()[:2]) + list(self.dim)) 59 | return view_comb 60 | 61 | 62 | class SumModule(nn.Module): 63 | def __init__(self): 64 | super(SumModule, self).__init__() 65 | 66 | def forward(self, x): 67 | # shape x: B, V, C, H, W 68 | x = torch.sum(x, dim=1, keepdim=False) 69 | return x 70 | 71 | 72 | class VPNModel(nn.Module): 73 | def __init__(self, num_views, num_class, output_size, fc_dim, map_extents, 74 | map_resolution): 75 | 76 | super(VPNModel, self).__init__() 77 | self.num_views = num_views 78 | self.output_size = output_size 79 | 80 | self.seg_size = ( 81 | int((map_extents[3] - map_extents[1]) / map_resolution), 82 | int((map_extents[2] - map_extents[0]) / map_resolution), 83 | ) 84 | 85 | # MODIFIED: we fix the transform module, the encoder and decoder to be 86 | # the ones described in the paper 87 | self.encoder = resnet18(True) 88 | self.transform_module = TransformModule(dim=self.output_size, 89 | num_view=self.num_views) 90 | self.decoder = PPMBilinear(num_class, fc_dim, False) 91 | 92 | def forward(self, x, *args): 93 | B, N, C, H, W = x.view([-1, self.num_views, int(x.size()[1] / self.num_views)] \ 94 | + list(x.size()[2:])).size() 95 | 96 | x = x.view(B*N, C, H, W) 97 | x = self.encoder(x)[0] 98 | x = x.view([B, N] + list(x.size()[1:])) 99 | x = self.transform_module(x) 100 | x = self.decoder([x], self.seg_size) 101 | 102 | return x 103 | 104 | 105 | 106 | def conv3x3(in_planes, out_planes, stride=1): 107 | "3x3 convolution with padding" 108 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 109 | padding=1, bias=False) 110 | 111 | class ResNet(nn.Module): 112 | 113 | def __init__(self, block, layers): 
114 | self.inplanes = 128 115 | super(ResNet, self).__init__() 116 | self.conv1 = conv3x3(3, 64, stride=2) 117 | self.bn1 = nn.BatchNorm2d(64) 118 | self.relu1 = nn.ReLU(inplace=True) 119 | self.conv2 = conv3x3(64, 64) 120 | self.bn2 = nn.BatchNorm2d(64) 121 | self.relu2 = nn.ReLU(inplace=True) 122 | self.conv3 = conv3x3(64, 128) 123 | self.bn3 = nn.BatchNorm2d(128) 124 | self.relu3 = nn.ReLU(inplace=True) 125 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 126 | 127 | self.layer1 = self._make_layer(block, 64, layers[0]) 128 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 129 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 130 | # self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 131 | 132 | 133 | for m in self.modules(): 134 | if isinstance(m, nn.Conv2d): 135 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 136 | m.weight.data.normal_(0, math.sqrt(2. / n)) 137 | elif isinstance(m, nn.BatchNorm2d): 138 | m.weight.data.fill_(1) 139 | m.bias.data.zero_() 140 | 141 | def _make_layer(self, block, planes, blocks, stride=1): 142 | downsample = None 143 | if stride != 1 or self.inplanes != planes * block.expansion: 144 | downsample = nn.Sequential( 145 | nn.Conv2d(self.inplanes, planes * block.expansion, 146 | kernel_size=1, stride=stride, bias=False), 147 | nn.BatchNorm2d(planes * block.expansion), 148 | ) 149 | 150 | layers = [] 151 | layers.append(block(self.inplanes, planes, stride, downsample)) 152 | self.inplanes = planes * block.expansion 153 | for _ in range(1, blocks): 154 | layers.append(block(self.inplanes, planes)) 155 | 156 | return nn.Sequential(*layers) 157 | 158 | def forward(self, x, return_feature_maps=False): 159 | x = self.relu1(self.bn1(self.conv1(x))) 160 | x = self.relu2(self.bn2(self.conv2(x))) 161 | x = self.relu3(self.bn3(self.conv3(x))) 162 | x = self.maxpool(x) 163 | 164 | conv_out = [] 165 | x = self.layer1(x); conv_out.append(x) 166 | x = self.layer2(x); conv_out.append(x) 167 | x = self.layer3(x); conv_out.append(x) 168 | # x = self.layer4(x) 169 | 170 | if return_feature_maps: 171 | return conv_out 172 | return [x] 173 | 174 | 175 | def resnet18(pretrained=False, **kwargs): 176 | model = ResNet(resnet.BasicBlock, [2, 2, 2, 2], **kwargs) 177 | if pretrained: 178 | weights = load_url('http://sceneparsing.csail.mit.edu/model/'\ 179 | 'pretrained_resnet/resnet18-imagenet.pth') 180 | state_dict = model.state_dict() 181 | for key, weight in state_dict.items(): 182 | weight.copy_(weights[key]) 183 | model.load_state_dict(state_dict) 184 | return model 185 | 186 | 187 | 188 | def load_url(url, model_dir='./pretrained', map_location=None): 189 | if not os.path.exists(model_dir): 190 | os.makedirs(model_dir) 191 | filename = url.split('/')[-1] 192 | cached_file = os.path.join(model_dir, filename) 193 | if not os.path.exists(cached_file): 194 | sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) 195 | urlretrieve(url, cached_file) 196 | return torch.load(cached_file, map_location=map_location) 197 | 198 | 199 | 200 | # pyramid pooling, bilinear upsample 201 | class PPMBilinear(nn.Module): 202 | def __init__(self, num_class=150, fc_dim=4096, 203 | use_softmax=False, pool_scales=(1, 2, 3, 6)): 204 | super(PPMBilinear, self).__init__() 205 | self.use_softmax = use_softmax 206 | 207 | self.ppm = [] 208 | for scale in pool_scales: 209 | self.ppm.append(nn.Sequential( 210 | nn.AdaptiveAvgPool2d(scale), 211 | nn.Conv2d(fc_dim, 512, kernel_size=1, bias=False), 212 | nn.BatchNorm2d(512), 
213 | nn.ReLU(inplace=True) 214 | )) 215 | self.ppm = nn.ModuleList(self.ppm) 216 | 217 | self.conv_last = nn.Sequential( 218 | nn.Conv2d(fc_dim+len(pool_scales)*512, 512, 219 | kernel_size=3, padding=1, bias=False), 220 | nn.BatchNorm2d(512), 221 | nn.ReLU(inplace=True), 222 | nn.Dropout2d(0.1), 223 | nn.Conv2d(512, num_class, kernel_size=1) 224 | ) 225 | 226 | def forward(self, conv_out, segSize=None, return_feat=False): 227 | conv5 = conv_out[-1] 228 | 229 | input_size = conv5.size() 230 | ppm_out = [conv5] 231 | for pool_scale in self.ppm: 232 | ppm_out.append(nn.functional.upsample( 233 | pool_scale(conv5), 234 | (input_size[2], input_size[3]), 235 | mode='bilinear', align_corners=False)) 236 | ppm_out = torch.cat(ppm_out, 1) 237 | 238 | x = self.conv_last(ppm_out) 239 | feat = x 240 | if segSize is not None: 241 | x = nn.functional.upsample( 242 | x, size=segSize, mode='bilinear', align_corners=False) 243 | if self.use_softmax: # is True during inference 244 | x = nn.functional.softmax(x, dim=1) 245 | 246 | # MODIFIED: we use BCE loss, so do not convert to log-softmax 247 | if return_feat: 248 | return x, feat 249 | return x -------------------------------------------------------------------------------- /src/nn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tom-roddick/mono-semantic-maps/f6b0f52857a4e622ae42bf24faaf9681cc089f76/src/nn/__init__.py -------------------------------------------------------------------------------- /src/nn/classifier.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | class LinearClassifier(nn.Conv2d): 7 | 8 | def __init__(self, in_channels, num_class): 9 | super().__init__(in_channels, num_class, 1) 10 | 11 | def initialise(self, prior): 12 | prior = torch.tensor(prior) 13 | self.weight.data.zero_() 14 | self.bias.data.copy_(torch.log(prior / (1 - prior))) 15 | 16 | 17 | 18 | class BayesianClassifier(nn.Module): 19 | 20 | def __init__(self, in_channels, num_class, num_samples=40): 21 | super().__init__() 22 | self.conv = nn.Conv2d(in_channels, num_class, 1) 23 | self.num_samples = num_samples 24 | 25 | def initialise(self, prior): 26 | prior = torch.tensor(prior) 27 | self.conv.weight.data.zero_() 28 | self.conv.bias.data.copy_(torch.log(prior / (1 - prior))) 29 | 30 | def forward(self, features): 31 | 32 | if self.training: 33 | # At training time, apply dropout once 34 | features = F.dropout2d(features, 0.5, training=True) 35 | logits = self.conv(features) 36 | 37 | else: 38 | # At test time, apply dropout multiple times and average the result 39 | mean_score = 0 40 | for _ in range(self.num_samples): 41 | drop_feats = F.dropout2d(features, 0.5, training=True) 42 | mean_score += F.sigmoid(self.conv(drop_feats)) 43 | mean_score = mean_score / self.num_samples 44 | 45 | # Convert back into logits format 46 | logits = torch.log(mean_score) - torch.log1p(-mean_score) 47 | 48 | return logits 49 | 50 | 51 | -------------------------------------------------------------------------------- /src/nn/fpn.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Adapted from the implementation of 3 | https://github.com/kuangliu/pytorch-retinanet/ 4 | ''' 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | from torchvision.models.utils import load_state_dict_from_url 10 | 11 | 
from .resnet import ResNetLayer 12 | 13 | 14 | class FPN(nn.Module): 15 | def __init__(self, num_blocks): 16 | super(FPN, self).__init__() 17 | self.in_planes = 64 18 | 19 | self.conv1 = nn.Conv2d( 20 | 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 21 | self.bn1 = nn.BatchNorm2d(64) 22 | 23 | # Bottom-up layers 24 | self.layer1 = ResNetLayer(64, 64, num_blocks[0], stride=1) 25 | self.layer2 = ResNetLayer(256, 128, num_blocks[1], stride=2) 26 | self.layer3 = ResNetLayer(512, 256, num_blocks[2], stride=2) 27 | self.layer4 = ResNetLayer(1024, 512, num_blocks[3], stride=2) 28 | self.conv6 = nn.Conv2d(2048, 256, kernel_size=3, stride=2, padding=1) 29 | self.conv7 = nn.Conv2d( 256, 256, kernel_size=3, stride=2, padding=1) 30 | 31 | # Lateral layers 32 | self.latlayer1 = nn.Conv2d(2048, 256, kernel_size=1, stride=1, padding=0) 33 | self.latlayer2 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0) 34 | self.latlayer3 = nn.Conv2d( 512, 256, kernel_size=1, stride=1, padding=0) 35 | 36 | # Top-down layers 37 | self.toplayer1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1) 38 | self.toplayer2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1) 39 | 40 | # ImageNet normalization parameters 41 | self.register_buffer('mean', torch.tensor([0.485, 0.456, 0.406])) 42 | self.register_buffer('std', torch.tensor([0.229, 0.224, 0.225])) 43 | 44 | 45 | def load_pretrained(self, path): 46 | pretrained = load_state_dict_from_url(path, progress=True) 47 | state_dict = self.state_dict() 48 | for key, weights in pretrained.items(): 49 | if key in state_dict: 50 | state_dict[key].copy_(weights) 51 | 52 | self.load_state_dict(state_dict) 53 | 54 | 55 | def _upsample_add(self, x, y): 56 | '''Upsample and add two feature maps. 57 | 58 | Args: 59 | x: (Variable) top feature map to be upsampled. 60 | y: (Variable) lateral feature map. 61 | 62 | Returns: 63 | (Variable) added feature map. 64 | 65 | Note in PyTorch, when input size is odd, the upsampled feature map 66 | with `F.upsample(..., scale_factor=2, mode='nearest')` 67 | maybe not equal to the lateral feature map size. 68 | 69 | e.g. 70 | original input size: [N,_,15,15] -> 71 | conv2d feature map size: [N,_,8,8] -> 72 | upsampled feature map size: [N,_,16,16] 73 | 74 | So we choose bilinear upsample which supports arbitrary output sizes. 
75 | ''' 76 | _,_,H,W = y.size() 77 | return F.upsample(x, size=(H,W), mode='bilinear') + y 78 | 79 | def forward(self, x): 80 | 81 | # Normalize image 82 | x = (x - self.mean.view(-1, 1, 1)) / self.std.view(-1, 1, 1) 83 | 84 | # Bottom-up 85 | c1 = F.relu(self.bn1(self.conv1(x))) 86 | c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1) 87 | c2 = self.layer1(c1) 88 | c3 = self.layer2(c2) 89 | c4 = self.layer3(c3) 90 | c5 = self.layer4(c4) 91 | p6 = self.conv6(c5) 92 | p7 = self.conv7(F.relu(p6)) 93 | # Top-down 94 | p5 = self.latlayer1(c5) 95 | p4 = self._upsample_add(p5, self.latlayer2(c4)) 96 | p4 = self.toplayer1(p4) 97 | p3 = self._upsample_add(p4, self.latlayer3(c3)) 98 | p3 = self.toplayer2(p3) 99 | return p3, p4, p5, p6, p7 100 | 101 | 102 | def FPN50(): 103 | fpn = FPN([3,4,6,3]) 104 | fpn.load_pretrained( 105 | 'https://download.pytorch.org/models/resnet50-19c8e357.pth') 106 | return fpn 107 | 108 | def FPN101(): 109 | fpn = FPN([2,4,23,3]) 110 | fpn.load_pretrained( 111 | 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth') 112 | return fpn 113 | -------------------------------------------------------------------------------- /src/nn/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | INV_LOG2 = 0.693147 5 | 6 | 7 | def balanced_binary_cross_entropy(logits, labels, mask, weights): 8 | weights = (logits.new(weights).view(-1, 1, 1) - 1) * labels.float() + 1. 9 | weights = weights * mask.unsqueeze(1).float() 10 | return F.binary_cross_entropy_with_logits(logits, labels.float(), weights) 11 | 12 | 13 | def uncertainty_loss(x, mask): 14 | """ 15 | Loss which maximizes the uncertainty in invalid regions of the image 16 | """ 17 | labels = ~mask 18 | x = x[labels.unsqueeze(1).expand_as(x)] 19 | xp, xm = x, -x 20 | entropy = xp.sigmoid() * F.logsigmoid(xp) + xm.sigmoid() * F.logsigmoid(xm) 21 | return 1. 
+ entropy.mean() / INV_LOG2 22 | 23 | 24 | def prior_uncertainty_loss(x, mask, priors): 25 | priors = x.new(priors).view(1, -1, 1, 1).expand_as(x) 26 | xent = F.binary_cross_entropy_with_logits(x, priors, reduce=False) 27 | return (xent * (~mask).float().unsqueeze(1)).mean() 28 | 29 | 30 | def kl_divergence_loss(mu, logvar): 31 | return -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp()) 32 | 33 | 34 | def focal_loss(logits, labels, mask, alpha=0.5, gamma=2): 35 | 36 | bce_loss = F.binary_cross_entropy_with_logits(logits, labels.float(), 37 | reduce=False) 38 | pt = torch.exp(-bce_loss) 39 | at = pt.new([alpha, 1 - alpha])[labels.long()] 40 | focal_loss = at * (1 - pt) ** gamma * bce_loss 41 | 42 | return (focal_loss * mask.unsqueeze(1).float()).mean() 43 | 44 | 45 | def prior_offset_loss(logits, labels, mask, priors): 46 | 47 | priors = logits.new(priors).view(-1, 1, 1) 48 | prior_logits = torch.log(priors / (1 - priors)) 49 | labels = labels.float() 50 | 51 | weights = .5 / priors * labels + .5 / (1 - priors) * (1 - labels) 52 | weights = weights * mask.unsqueeze(1).float() 53 | return F.binary_cross_entropy_with_logits(logits - prior_logits, labels, 54 | weights) 55 | -------------------------------------------------------------------------------- /src/nn/pyramid.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | 5 | from .transformer import DenseTransformer 6 | 7 | class TransformerPyramid(nn.Module): 8 | 9 | def __init__(self, in_channels, channels, resolution, extents, ymin, ymax, 10 | focal_length): 11 | 12 | super().__init__() 13 | self.transformers = nn.ModuleList() 14 | for i in range(5): 15 | 16 | # Scaled focal length for each transformer 17 | focal = focal_length / pow(2, i + 3) 18 | 19 | # Compute grid bounds for each transformer 20 | zmax = min(math.floor(focal * 2) * resolution, extents[3]) 21 | zmin = math.floor(focal) * resolution if i < 4 else extents[1] 22 | subset_extents = [extents[0], zmin, extents[2], zmax] 23 | # Build transformers 24 | tfm = DenseTransformer(in_channels, channels, resolution, 25 | subset_extents, ymin, ymax, focal) 26 | self.transformers.append(tfm) 27 | 28 | 29 | def forward(self, feature_maps, calib): 30 | 31 | bev_feats = list() 32 | for i, fmap in enumerate(feature_maps): 33 | 34 | # Scale calibration matrix to account for downsampling 35 | scale = 8 * 2 ** i 36 | calib_downsamp = calib.clone() 37 | calib_downsamp[:, :2] = calib[:, :2] / scale 38 | 39 | # Apply orthographic transformation to each feature map separately 40 | bev_feats.append(self.transformers[i](fmap, calib_downsamp)) 41 | 42 | # Combine birds-eye-view feature maps along the depth axis 43 | return torch.cat(bev_feats[::-1], dim=-2) 44 | -------------------------------------------------------------------------------- /src/nn/resampler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .. 
import utils 6 | 7 | class Resampler(nn.Module): 8 | 9 | def __init__(self, resolution, extents): 10 | super().__init__() 11 | 12 | # Store z positions of the near and far planes 13 | self.near = extents[1] 14 | self.far = extents[3] 15 | 16 | # Make a grid in the x-z plane 17 | self.grid = _make_grid(resolution, extents) 18 | 19 | 20 | def forward(self, features, calib): 21 | 22 | # Copy grid to the correct device 23 | self.grid = self.grid.to(features) 24 | 25 | # We ignore the image v-coordinate, and assume the world Y-coordinate 26 | # is zero, so we only need a 2x2 submatrix of the original 3x3 matrix 27 | calib = calib[:, [0, 2]][..., [0, 2]].view(-1, 1, 1, 2, 2) 28 | 29 | # Transform grid center locations into image u-coordinates 30 | cam_coords = torch.matmul(calib, self.grid.unsqueeze(-1)).squeeze(-1) 31 | 32 | # Apply perspective projection and normalize 33 | ucoords = cam_coords[..., 0] / cam_coords[..., 1] 34 | ucoords = ucoords / features.size(-1) * 2 - 1 35 | 36 | # Normalize z coordinates 37 | zcoords = (cam_coords[..., 1]-self.near) / (self.far-self.near) * 2 - 1 38 | 39 | # Resample 3D feature map 40 | grid_coords = torch.stack([ucoords, zcoords], -1).clamp(-1.1, 1.1) 41 | return F.grid_sample(features, grid_coords) 42 | 43 | 44 | def _make_grid(resolution, extents): 45 | # Create a grid of cooridinates in the birds-eye-view 46 | x1, z1, x2, z2 = extents 47 | zz, xx = torch.meshgrid( 48 | torch.arange(z1, z2, resolution), torch.arange(x1, x2, resolution)) 49 | 50 | return torch.stack([xx, zz], dim=-1) -------------------------------------------------------------------------------- /src/nn/resnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | def conv3x3(in_planes, out_planes, stride=1, dilation=1): 6 | """3x3 convolution with padding""" 7 | 8 | # Fractional strides correspond to transpose convolution 9 | if stride < 1: 10 | stride = int(round(1 / stride)) 11 | kernel_size = stride + 2 12 | padding = int((dilation * (kernel_size - 1) - stride + 1) / 2) 13 | return nn.ConvTranspose2d( 14 | in_planes, out_planes, kernel_size, stride, padding, 15 | output_padding=0, dilation=dilation, bias=False) 16 | 17 | # Otherwise return normal convolution 18 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=int(stride), 19 | dilation=dilation, padding=dilation, bias=False) 20 | 21 | 22 | def conv1x1(in_planes, out_planes, stride=1): 23 | """1x1 convolution""" 24 | 25 | # Fractional strides correspond to transpose convolution 26 | if int(1 / stride) > 1: 27 | stride = int(1 / stride) 28 | return nn.ConvTranspose2d( 29 | in_planes, out_planes, kernel_size=stride, stride=stride,bias=False) 30 | 31 | return nn.Conv2d( 32 | in_planes, out_planes, kernel_size=1, stride=int(stride), bias=False) 33 | 34 | 35 | class BasicBlock(nn.Module): 36 | expansion = 1 37 | 38 | def __init__(self, inplanes, planes, stride=1, dilation=1): 39 | super(BasicBlock, self).__init__() 40 | 41 | self.conv1 = conv3x3(inplanes, planes, stride, dilation) 42 | self.bn1 = nn.GroupNorm(16, planes) 43 | 44 | self.conv2 = conv3x3(planes, planes, 1, dilation) 45 | self.bn2 = nn.GroupNorm(16, planes) 46 | 47 | if stride != 1 or inplanes != planes: 48 | self.downsample = nn.Sequential( 49 | conv1x1(inplanes, planes, stride), nn.GroupNorm(16, planes)) 50 | else: 51 | self.downsample = None 52 | 53 | 54 | def forward(self, x): 55 | identity = x 56 | 57 | out = F.relu(self.bn1(self.conv1(x)), 
inplace=True) 58 | out = self.bn2(self.conv2(out)) 59 | 60 | if self.downsample is not None: 61 | identity = self.downsample(x) 62 | 63 | out += identity 64 | out = F.relu(out, inplace=True) 65 | 66 | return out 67 | 68 | 69 | class Bottleneck(nn.Module): 70 | expansion = 4 71 | 72 | def __init__(self, inplanes, planes, stride=1, dilation=1): 73 | super(Bottleneck, self).__init__() 74 | self.conv1 = conv1x1(inplanes, planes) 75 | self.bn1 = nn.GroupNorm(16, planes) 76 | self.conv2 = conv3x3(planes, planes, stride, dilation) 77 | self.bn2 = nn.GroupNorm(16, planes) 78 | self.conv3 = conv1x1(planes, planes * self.expansion) 79 | self.bn3 = nn.GroupNorm(16, planes * self.expansion) 80 | 81 | if stride != 1 or inplanes != planes * self.expansion: 82 | self.downsample = nn.Sequential( 83 | conv1x1(inplanes, planes * self.expansion, stride), 84 | nn.GroupNorm(16, planes * self.expansion)) 85 | else: 86 | self.downsample = None 87 | 88 | def forward(self, x): 89 | identity = x 90 | 91 | out = F.relu(self.bn1(self.conv1(x)), inplace=True) 92 | out = F.relu(self.bn2(self.conv2(out)), inplace=True) 93 | out = self.bn3(self.conv3(out)) 94 | 95 | if self.downsample is not None: 96 | identity = self.downsample(x) 97 | 98 | out += identity 99 | out = F.relu(out) 100 | 101 | return out 102 | 103 | 104 | class ResNetLayer(nn.Sequential): 105 | 106 | def __init__(self, in_channels, channels, num_blocks, stride=1, 107 | dilation=1, blocktype='bottleneck'): 108 | 109 | # Get block type 110 | if blocktype == 'basic': 111 | block = BasicBlock 112 | elif blocktype == 'bottleneck': 113 | block = Bottleneck 114 | else: 115 | raise Exception("Unknown residual block type: " + str(blocktype)) 116 | 117 | # Construct layers 118 | layers = [block(in_channels, channels, stride, dilation)] 119 | for _ in range(1, num_blocks): 120 | layers.append(block(channels * block.expansion, channels, 1, dilation)) 121 | 122 | self.in_channels = in_channels 123 | self.out_channels = channels * block.expansion 124 | 125 | super(ResNetLayer, self).__init__(*layers) -------------------------------------------------------------------------------- /src/nn/topdown.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .resnet import ResNetLayer 6 | 7 | class TopdownNetwork(nn.Sequential): 8 | 9 | def __init__(self, in_channels, channels, layers=[6, 1, 1], 10 | strides=[1, 2, 2], blocktype='basic'): 11 | 12 | modules = list() 13 | self.downsample = 1 14 | for nblocks, stride in zip(layers, strides): 15 | 16 | # Add a new residual layer 17 | module = ResNetLayer( 18 | in_channels, channels, nblocks, 1/stride, blocktype=blocktype) 19 | modules.append(module) 20 | 21 | # Halve the number of channels at each layer 22 | in_channels = module.out_channels 23 | channels = channels // 2 24 | self.downsample *= stride 25 | 26 | self.out_channels = in_channels 27 | 28 | 29 | super().__init__(*modules) -------------------------------------------------------------------------------- /src/nn/transformer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from .resampler import Resampler 7 | 8 | class DenseTransformer(nn.Module): 9 | 10 | def __init__(self, in_channels, channels, resolution, grid_extents, 11 | ymin, ymax, focal_length, groups=1): 12 | super().__init__() 13 | 14 | # Initial convolution 
to reduce feature dimensions 15 | self.conv = nn.Conv2d(in_channels, channels, 1) 16 | self.bn = nn.GroupNorm(16, channels) 17 | 18 | # Resampler transforms perspective features to BEV 19 | self.resampler = Resampler(resolution, grid_extents) 20 | 21 | # Compute input height based on region of image covered by grid 22 | self.zmin, zmax = grid_extents[1], grid_extents[3] 23 | self.in_height = math.ceil(focal_length * (ymax - ymin) / self.zmin) 24 | self.ymid = (ymin + ymax) / 2 25 | 26 | # Compute number of output cells required 27 | self.out_depth = math.ceil((zmax - self.zmin) / resolution) 28 | 29 | # Dense layer which maps UV features to UZ 30 | self.fc = nn.Conv1d( 31 | channels * self.in_height, channels * self.out_depth, 1, groups=groups 32 | ) 33 | self.out_channels = channels 34 | 35 | 36 | def forward(self, features, calib, *args): 37 | 38 | # Crop feature maps to a fixed input height 39 | features = torch.stack([self._crop_feature_map(fmap, cal) 40 | for fmap, cal in zip(features, calib)]) 41 | 42 | # Reduce feature dimension to minimize memory usage 43 | features = F.relu(self.bn(self.conv(features))) 44 | 45 | # Flatten height and channel dimensions 46 | B, C, _, W = features.shape 47 | flat_feats = features.flatten(1, 2) 48 | bev_feats = self.fc(flat_feats).view(B, C, -1, W) 49 | 50 | # Resample to orthographic grid 51 | return self.resampler(bev_feats, calib) 52 | 53 | 54 | def _crop_feature_map(self, fmap, calib): 55 | 56 | # Compute upper and lower bounds of visible region 57 | focal_length, img_offset = calib[1, 1:] 58 | vmid = self.ymid * focal_length / self.zmin + img_offset 59 | vmin = math.floor(vmid - self.in_height / 2) 60 | vmax = math.floor(vmid + self.in_height / 2) 61 | 62 | # Pad or crop input tensor to match dimensions 63 | return F.pad(fmap, [0, 0, -vmin, vmax - fmap.shape[-2]]) -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .geometry import * -------------------------------------------------------------------------------- /src/utils/configs.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from yacs.config import CfgNode 4 | try: 5 | from dotenv import load_dotenv 6 | load_dotenv() 7 | except: 8 | pass 9 | 10 | ROOT = os.path.abspath(os.path.join(__file__, '..', '..', '..')) 11 | 12 | 13 | def load_config(config_path): 14 | with open(config_path) as f: 15 | return CfgNode.load_cfg(f) 16 | 17 | def get_default_configuration(): 18 | defaults_path = os.path.join(ROOT, 'configs/defaults.yml') 19 | return load_config(defaults_path) 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/utils/confusion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class BinaryConfusionMatrix(object): 4 | 5 | def __init__(self, num_class): 6 | self.tp = torch.zeros(num_class, dtype=torch.long) 7 | self.fp = torch.zeros(num_class, dtype=torch.long) 8 | self.fn = torch.zeros(num_class, dtype=torch.long) 9 | self.tn = torch.zeros(num_class, dtype=torch.long) 10 | 11 | 12 | @property 13 | def num_class(self): 14 | return len(self.tp) 15 | 16 | def update(self, preds, labels, mask=None): 17 | 18 | preds = preds.detach().cpu() 19 | labels = labels.detach().cpu() 20 | 21 | # Move batch dimension to the end 22 | preds = preds.flatten(2, -1).permute(1, 0, 2).reshape( 23 | 
self.num_class, -1) 24 | labels = labels.flatten(2, -1).permute(1, 0, 2).reshape( 25 | self.num_class, -1) 26 | 27 | if mask is not None: 28 | preds = preds[:, mask.flatten()] 29 | labels = labels[:, mask.flatten()] 30 | 31 | 32 | true_pos = preds & labels 33 | false_pos = preds & ~labels 34 | false_neg = ~preds & labels 35 | true_neg = ~preds & ~labels 36 | 37 | # Update global counts 38 | self.tp += true_pos.long().sum(-1) 39 | self.fp += false_pos.long().sum(-1) 40 | self.fn += false_neg.long().sum(-1) 41 | self.tn += true_neg.long().sum(-1) 42 | 43 | 44 | @property 45 | def iou(self): 46 | return self.tp.float() / (self.tp + self.fn + self.fp).float() 47 | 48 | @property 49 | def mean_iou(self): 50 | # Only compute mean over classes with at least one ground truth 51 | valid = (self.tp + self.fn) > 0 52 | if not valid.any(): 53 | return 0 54 | return float(self.iou[valid].mean()) 55 | 56 | @property 57 | def dice(self): 58 | return 2 * self.tp.float() / (2 * self.tp + self.fp + self.fn).float() 59 | 60 | @property 61 | def macro_dice(self): 62 | valid = (self.tp + self.fn) > 0 63 | if not valid.any(): 64 | return 0 65 | return float(self.dice[valid].mean()) 66 | 67 | @property 68 | def precision(self): 69 | return self.tp.float() / (self.tp + self.fp).float() 70 | 71 | @property 72 | def recall(self): 73 | return self.tp.float() / (self.tp + self.fn).float() -------------------------------------------------------------------------------- /src/utils/geometry.py: -------------------------------------------------------------------------------- 1 | from collections import Iterable 2 | import torch 3 | 4 | def make_grid(grid_size, cell_size=None, grid_offset=None): 5 | """Construct an N-dimensional grid""" 6 | 7 | # Handle default or non-tuple cell_sizes 8 | if cell_size is None: 9 | cell_size = [1.] * len(grid_size) 10 | elif not isinstance(cell_size, Iterable): 11 | cell_size = [cell_size] * len(grid_size) 12 | 13 | # By default the grid offset is set to zero 14 | if grid_offset is None: 15 | grid_offset = [0.] 
* len(grid_size) 16 | 17 | coords = [torch.arange(0, gs, cs) + off for gs, cs, off 18 | in zip(grid_size, cell_size, grid_offset)] 19 | grid = torch.meshgrid(*coords[::-1])[::-1] 20 | return torch.stack(grid, -1) -------------------------------------------------------------------------------- /src/utils/visualise.py: -------------------------------------------------------------------------------- 1 | from matplotlib.cm import get_cmap 2 | 3 | def colorise(tensor, cmap, vmin=None, vmax=None): 4 | 5 | if isinstance(cmap, str): 6 | cmap = get_cmap(cmap) 7 | 8 | tensor = tensor.detach().cpu().float() 9 | 10 | vmin = float(tensor.min()) if vmin is None else vmin 11 | vmax = float(tensor.max()) if vmax is None else vmax 12 | 13 | tensor = (tensor - vmin) / (vmax - vmin) 14 | return cmap(tensor.numpy())[..., :3] -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from datetime import datetime 4 | from argparse import ArgumentParser 5 | from tqdm import tqdm 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.optim import SGD 10 | from torch.optim.lr_scheduler import MultiStepLR 11 | from torch.utils.tensorboard import SummaryWriter 12 | 13 | from src.models.model_factory import build_model, build_criterion 14 | from src.data.data_factory import build_dataloaders 15 | from src.utils.configs import get_default_configuration, load_config 16 | from src.utils.confusion import BinaryConfusionMatrix 17 | from src.data.nuscenes.utils import NUSCENES_CLASS_NAMES 18 | from src.data.argoverse.utils import ARGOVERSE_CLASS_NAMES 19 | from src.utils.visualise import colorise 20 | 21 | def train(dataloader, model, criterion, optimiser, summary, config, epoch): 22 | 23 | model.train() 24 | 25 | # Compute prior probability of occupancy 26 | prior = torch.tensor(config.prior) 27 | prior_log_odds = torch.log(prior / (1 - prior)) 28 | 29 | # Initialise confusion matrix 30 | confusion = BinaryConfusionMatrix(config.num_class) 31 | 32 | # Iterate over dataloader 33 | iteration = (epoch - 1) * len(dataloader) 34 | for i, batch in enumerate(tqdm(dataloader)): 35 | 36 | # Move tensors to GPU 37 | if len(config.gpus) > 0: 38 | batch = [t.cuda() for t in batch] 39 | 40 | # Predict class occupancy scores and compute loss 41 | image, calib, labels, mask = batch 42 | if config.model == 'ved': 43 | logits, mu, logvar = model(image) 44 | loss = criterion(logits, labels, mask, mu, logvar) 45 | else: 46 | logits = model(image, calib) 47 | loss = criterion(logits, labels, mask) 48 | 49 | 50 | # Compute gradients and update parameters 51 | optimiser.zero_grad() 52 | loss.backward() 53 | optimiser.step() 54 | 55 | # Update confusion matrix 56 | scores = logits.cpu().sigmoid() 57 | confusion.update(scores > config.score_thresh, labels, mask) 58 | 59 | # Update tensorboard 60 | if i % config.log_interval == 0: 61 | summary.add_scalar('train/loss', float(loss), iteration) 62 | 63 | # Visualise 64 | if i % config.vis_interval == 0: 65 | visualise(summary, image, scores, labels, mask, iteration, 66 | config.train_dataset, split='train') 67 | 68 | iteration += 1 69 | 70 | # Print and record results 71 | display_results(confusion, config.train_dataset) 72 | log_results(confusion, config.train_dataset, summary, 'train', epoch) 73 | 74 | 75 | 76 | def evaluate(dataloader, model, criterion, summary, config, epoch): 77 | 78 | model.eval() 79 | 80 | # Compute prior probability of occupancy 81 | 
82 |     prior_log_odds = torch.log(prior / (1 - prior))  # note: not used below
83 | 
84 |     # Initialise confusion matrix
85 |     confusion = BinaryConfusionMatrix(config.num_class)
86 | 
87 |     # Iterate over dataset
88 |     for i, batch in enumerate(tqdm(dataloader)):
89 | 
90 |         # Move tensors to GPU
91 |         if len(config.gpus) > 0:
92 |             batch = [t.cuda() for t in batch]
93 | 
94 |         # Predict class occupancy scores and compute loss
95 |         image, calib, labels, mask = batch
96 |         with torch.no_grad():
97 |             if config.model == 'ved':
98 |                 logits, mu, logvar = model(image)
99 |                 loss = criterion(logits, labels, mask, mu, logvar)
100 |             else:
101 |                 logits = model(image, calib)
102 |                 loss = criterion(logits, labels, mask)
103 | 
104 |         # Update confusion matrix
105 |         scores = logits.cpu().sigmoid()
106 |         confusion.update(scores > config.score_thresh, labels, mask)
107 | 
108 |         # Update tensorboard
109 |         if i % config.log_interval == 0:
110 |             summary.add_scalar('val/loss', float(loss), epoch)
111 | 
112 |         # Visualise
113 |         if i % config.vis_interval == 0:
114 |             visualise(summary, image, scores, labels, mask, epoch,
115 |                       config.train_dataset, split='val')
116 | 
117 |     # Print and record results
118 |     display_results(confusion, config.train_dataset)
119 |     log_results(confusion, config.train_dataset, summary, 'val', epoch)
120 | 
121 |     return confusion.mean_iou
122 | 
123 | 
124 | def visualise(summary, image, scores, labels, mask, step, dataset, split):
125 | 
126 |     class_names = NUSCENES_CLASS_NAMES if dataset == 'nuscenes' \
127 |         else ARGOVERSE_CLASS_NAMES
128 | 
129 |     summary.add_image(split + '/image', image[0], step, dataformats='CHW')
130 |     summary.add_image(split + '/pred', colorise(scores[0], 'coolwarm', 0, 1),
131 |                       step, dataformats='NHWC')
132 |     summary.add_image(split + '/gt', colorise(labels[0], 'coolwarm', 0, 1),
133 |                       step, dataformats='NHWC')
134 | 
135 | 
136 |     # for i, name in enumerate(class_names):
137 |     #     summary.add_image(split + '/pred/' + name, scores[0, i], step,
138 |     #                       dataformats='HW')
139 |     #     summary.add_image(split + '/gt/' + name, labels[0, i], step,
140 |     #                       dataformats='HW')
141 | 
142 |     # summary.add_image(split + '/mask', mask[0], step, dataformats='HW')
143 | 
144 | 
145 | def display_results(confusion, dataset):
146 | 
147 |     # Display confusion matrix summary
148 |     class_names = NUSCENES_CLASS_NAMES if dataset == 'nuscenes' \
149 |         else ARGOVERSE_CLASS_NAMES
150 | 
151 |     print('\nResults:')
152 |     for name, iou_score in zip(class_names, confusion.iou):
153 |         print('{:20s} {:.3f}'.format(name, iou_score))
154 |     print('{:20s} {:.3f}'.format('MEAN', confusion.mean_iou))
155 | 
156 | 
157 | 
158 | def log_results(confusion, dataset, summary, split, epoch):
159 | 
160 |     # Display and record epoch IoU scores
161 |     class_names = NUSCENES_CLASS_NAMES if dataset == 'nuscenes' \
162 |         else ARGOVERSE_CLASS_NAMES
163 | 
164 |     for name, iou_score in zip(class_names, confusion.iou):
165 |         summary.add_scalar(f'{split}/iou/{name}', iou_score, epoch)
166 |     summary.add_scalar(f'{split}/iou/MEAN', confusion.mean_iou, epoch)
167 | 
168 | 
169 | 
170 | def save_checkpoint(path, model, optimizer, scheduler, epoch, best_iou):
171 | 
172 |     if isinstance(model, nn.DataParallel):
173 |         model = model.module
174 | 
175 |     ckpt = {
176 |         'model' : model.state_dict(),
177 |         'optimizer' : optimizer.state_dict(),
178 |         'scheduler' : scheduler.state_dict(),
179 |         'epoch' : epoch,
180 |         'best_iou' : best_iou
181 |     }
182 | 
183 |     torch.save(ckpt, path)
184 | 
185 | 
186 | def load_checkpoint(path, model, optimizer, scheduler):
187 | 
188 |     ckpt = torch.load(path)
189 | 
190 |     # Load model weights
191 |     if isinstance(model, nn.DataParallel):
192 |         model = model.module
193 |     model.load_state_dict(ckpt['model'])
194 | 
195 |     # Load optimiser state
196 |     optimizer.load_state_dict(ckpt['optimizer'])
197 | 
198 |     # Load scheduler state
199 |     scheduler.load_state_dict(ckpt['scheduler'])
200 | 
201 |     return ckpt['epoch'], ckpt['best_iou']
202 | 
203 | 
204 | 
205 | # Load the configuration for this experiment
206 | def get_configuration(args):
207 | 
208 |     # Load config defaults
209 |     config = get_default_configuration()
210 | 
211 |     # Load dataset options
212 |     config.merge_from_file(f'configs/datasets/{args.dataset}.yml')
213 | 
214 |     # Load model options
215 |     config.merge_from_file(f'configs/models/{args.model}.yml')
216 | 
217 |     # Load experiment options
218 |     config.merge_from_file(f'configs/experiments/{args.experiment}.yml')
219 | 
220 |     # Restore config from an existing experiment
221 |     if args.resume is not None:
222 |         config.merge_from_file(os.path.join(args.resume, 'config.yml'))
223 | 
224 |     # Override with command line options
225 |     config.merge_from_list(args.options)
226 | 
227 |     # Finalise config
228 |     config.freeze()
229 | 
230 |     return config
231 | 
232 | 
233 | def create_experiment(config, tag, resume=None):
234 | 
235 |     # Restore an existing experiment if a directory is specified
236 |     if resume is not None:
237 |         print("\n==> Restoring experiment from directory:\n" + resume)
238 |         logdir = resume
239 |     else:
240 |         # Otherwise, generate a run directory based on the current time
241 |         name = datetime.now().strftime('{}_%y-%m-%d--%H-%M-%S').format(tag)
242 |         logdir = os.path.join(os.path.expandvars(config.logdir), name)
243 |         print("\n==> Creating new experiment in directory:\n" + logdir)
244 |         os.makedirs(logdir)
245 | 
246 |     # Display the config options on-screen
247 |     print(config.dump())
248 | 
249 |     # Save the current config
250 |     with open(os.path.join(logdir, 'config.yml'), 'w') as f:
251 |         f.write(config.dump())
252 | 
253 |     return logdir
254 | 
255 | 
256 | 
257 | 
258 | 
259 | 
260 | 
261 | 
262 | def main():
263 | 
264 |     parser = ArgumentParser()
265 |     parser.add_argument('--tag', type=str, default='run',
266 |                         help='optional tag to identify the run')
267 |     parser.add_argument('--dataset', choices=['nuscenes', 'argoverse'],
268 |                         default='nuscenes', help='dataset to train on')
269 |     parser.add_argument('--model', choices=['pyramid', 'vpn', 'ved'],
270 |                         default='pyramid', help='model to train')
271 |     parser.add_argument('--experiment', default='test',
272 |                         help='name of experiment config to load')
273 |     parser.add_argument('--resume', default=None,
274 |                         help='path to an experiment to resume')
275 |     parser.add_argument('--options', nargs='*', default=[],
276 |                         help='list of additional config options as key-val pairs')
277 |     args = parser.parse_args()
278 | 
279 |     # Load configuration
280 |     config = get_configuration(args)
281 | 
282 |     # Create a directory for the experiment
283 |     logdir = create_experiment(config, args.tag, args.resume)
284 | 
285 |     # Create tensorboard summary
286 |     summary = SummaryWriter(logdir)
287 | 
288 |     # Set default device
289 |     if len(config.gpus) > 0:
290 |         torch.cuda.set_device(config.gpus[0])
291 | 
292 |     # Setup experiment
293 |     model = build_model(config.model, config)
294 |     criterion = build_criterion(config.model, config)
295 |     train_loader, val_loader = build_dataloaders(config.train_dataset, config)
296 | 
297 |     # Build optimiser and learning rate scheduler
298 |     optimiser = SGD(model.parameters(), config.learning_rate,
299 |                     weight_decay=config.weight_decay)
300 |     lr_scheduler = MultiStepLR(optimiser, config.lr_milestones, 0.1)
301 | 
302 |     # Load checkpoint
303 |     if args.resume:
304 |         epoch, best_iou = load_checkpoint(os.path.join(logdir, 'latest.pth'),
305 |                                           model, optimiser, lr_scheduler)
306 |     else:
307 |         epoch, best_iou = 1, 0
308 | 
309 |     # Main training loop
310 |     while epoch <= config.num_epochs:
311 | 
312 |         print('\n\n=== Beginning epoch {} of {} ==='.format(epoch,
313 |                                                             config.num_epochs))
314 | 
315 |         # Train model for one epoch
316 |         train(train_loader, model, criterion, optimiser, summary, config, epoch)
317 | 
318 |         # Evaluate on the validation set
319 |         val_iou = evaluate(val_loader, model, criterion, summary, config, epoch)
320 | 
321 |         # Update learning rate
322 |         lr_scheduler.step()
323 | 
324 |         # Save checkpoints
325 |         if val_iou > best_iou:
326 |             best_iou = val_iou
327 |             save_checkpoint(os.path.join(logdir, 'best.pth'), model,
328 |                             optimiser, lr_scheduler, epoch, best_iou)
329 | 
330 |         save_checkpoint(os.path.join(logdir, 'latest.pth'), model, optimiser,
331 |                         lr_scheduler, epoch, best_iou)
332 | 
333 |         epoch += 1
334 | 
335 |     print("\nTraining complete!")
336 | 
337 | 
338 | 
339 | if __name__ == '__main__':
340 |     main()
341 | 
342 | 
343 | 
344 | 
345 | 
346 | 
347 | 
348 | 
349 | 
350 | 
351 | 
352 | 
353 | 
354 | 
355 | 
356 | 
357 | 
358 | 
--------------------------------------------------------------------------------
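
Usage note (editor's addition, not part of the repository): the save_checkpoint and load_checkpoint helpers in train.py persist the model, optimiser and scheduler state together with the epoch counter and the best validation IoU. The sketch below shows that round-trip in isolation, assuming the project dependencies are installed and the interpreter is started from the repository root; the throwaway nn.Linear model and the path /tmp/example_ckpt.pth are illustrative stand-ins, not names used by the codebase.

    import torch.nn as nn
    from torch.optim import SGD
    from torch.optim.lr_scheduler import MultiStepLR

    from train import save_checkpoint, load_checkpoint

    # Stand-in objects; a real run would obtain these from build_model and the config
    model = nn.Linear(4, 2)
    optimiser = SGD(model.parameters(), lr=0.1)
    scheduler = MultiStepLR(optimiser, milestones=[10, 20], gamma=0.1)

    # Save epoch 3 with a best IoU of 0.42, then restore both values into the same objects
    save_checkpoint('/tmp/example_ckpt.pth', model, optimiser, scheduler, 3, 0.42)
    epoch, best_iou = load_checkpoint('/tmp/example_ckpt.pth', model, optimiser, scheduler)
    assert epoch == 3 and abs(best_iou - 0.42) < 1e-9

Note that load_checkpoint returns the epoch stored at save time, so a resumed run in main() re-enters the training loop at that same epoch value.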